{
  "baseline_summary": {
    "baseline_heads": "minimal and neural MLP heads",
    "current_use": "task design, data-contract validation, case studies, and baseline comparison",
    "split": "chronological single-episode split for public-sample diagnostics",
    "task_count": 12
  },
  "directions": [
    {
      "code": "A",
      "counts": {
        "diagnostic": 0,
        "direct": 2,
        "proxy": 2,
        "total_links": 4
      },
      "current_readout": "The sample supports hand trajectory forecasting and contact/object probes, but it does not yet include a full body/shape model or multi-person priors.",
      "current_status": "partially implemented",
      "extension_tasks": [
        {
          "current_limit": "This is a motion-energy proxy, not a SMPL/MANO body model or a generative motion prior.",
          "family": "classification",
          "id": "body_motion_intensity",
          "metric_name": "macro-F1",
          "name": "Body and Hand Motion Intensity"
        }
      ],
      "focus": "Human/hand/body motion, deformation priors, human-object interaction, affordance modeling.",
      "id": "human_motion",
      "name": "Human Modeling & Motion Understanding",
      "next_steps": [
        "Add SMPL/SMPL-X or MANO-style body/hand parameter targets where available.",
        "Train sequence models over multi-episode motion trajectories instead of isolated windows.",
        "Evaluate affordance prediction on held-out objects and held-out episodes."
      ],
      "preferred_background": "Human pose/shape estimation, SMPL-style models, motion capture, or motion generation.",
      "task_ids": [
        "timeline_action",
        "hand_trajectory_forecast",
        "contact_prediction",
        "object_relevance"
      ],
      "tasks": [
        {
          "architecture_family": "multiclass classifier",
          "case_study": "In the coffee-making sample, if the 20-frame window is during a pouring moment, the task asks the model to output an action such as Pour coffee or Pour milk into coffee.",
          "current_limit": "Chronological single-episode split creates unseen future action classes.",
          "direction_roles": {
            "A": "proxy",
            "C": "direct"
          },
          "display_name": "Action Recognition",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/metrics.json",
              "label": "Neural metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/predictions.csv",
              "label": "Minimal predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/predictions.csv",
              "label": "Neural predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/confusion_matrix.csv",
              "label": "Confusion matrix"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/confusion_matrix.csv",
              "label": "Neural confusion matrix"
            }
          ],
          "family": "supervised",
          "id": "timeline_action",
          "input": "One 20-frame window represented by the current feature vector: video/audio/depth summaries, pose, SLAM/camera pose, motion capture, IMU, calibration, and language-derived context.",
          "input_short": "20-frame multimodal window",
          "metric": {
            "better_baseline": "minimal",
            "direction": "higher",
            "key": "macro_f1",
            "minimal": 0.05,
            "name": "macro-F1",
            "neural_mlp": 0.0148
          },
          "modalities": [
            "video",
            "depth",
            "pose_slam",
            "motion_capture",
            "inertial",
            "language"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "current action class",
          "primary_direction": "C",
          "process_short": "window features -> action label builder -> classifier",
          "research_name": "Egocentric Action Recognition",
          "why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout."
        },
        {
          "architecture_family": "continuous regressor",
          "case_study": "When the hand is moving toward a cup or bottle, the model predicts the future 3D hand-joint path.",
          "current_limit": "Forecasting is window-level and not yet a full sequence or policy model.",
          "direction_roles": {
            "A": "direct",
            "C": "proxy"
          },
          "display_name": "Hand Trajectory Forecasting",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/hand_trajectory_forecast/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json",
              "label": "Neural metrics"
            }
          ],
          "family": "forecast",
          "id": "hand_trajectory_forecast",
          "input": "The current all-modality window vector at time t.",
          "input_short": "current multimodal window",
          "metric": {
            "better_baseline": "neural_mlp",
            "direction": "lower",
            "key": "mpjpe",
            "minimal": 0.8647,
            "name": "MPJPE",
            "neural_mlp": 0.1079
          },
          "modalities": [
            "motion_capture",
            "video",
            "depth",
            "pose_slam",
            "inertial"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "future hand-joint trajectory",
          "primary_direction": "A",
          "process_short": "current features -> future mocap target -> regression head",
          "research_name": "3D Hand Motion Forecasting",
          "why": "Directly predicts human hand motion and supports hand-object interaction modeling."
        },
        {
          "architecture_family": "binary classifier",
          "case_study": "During manipulation, the hand may touch a cup, table, or bottle. The task asks whether any contact is happening.",
          "current_limit": "The public sample is degenerate for this target because one class dominates.",
          "direction_roles": {
            "A": "direct",
            "C": "proxy"
          },
          "display_name": "Contact State Prediction",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/metrics.json",
              "label": "Neural metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/predictions.csv",
              "label": "Minimal predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/predictions.csv",
              "label": "Neural predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/confusion_matrix.csv",
              "label": "Confusion matrix"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/confusion_matrix.csv",
              "label": "Neural confusion matrix"
            }
          ],
          "family": "supervised",
          "id": "contact_prediction",
          "input": "Non-contact and non-caption feature blocks, so the answer is not directly leaked from the target labels.",
          "input_short": "non-contact, non-caption features",
          "metric": {
            "better_baseline": "tie",
            "direction": "higher",
            "key": "macro_f1",
            "minimal": 1.0,
            "name": "macro-F1",
            "neural_mlp": 1.0
          },
          "modalities": [
            "motion_capture",
            "video",
            "depth",
            "inertial"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "contact or no contact",
          "primary_direction": "A",
          "process_short": "feature filter -> contact target -> binary classifier",
          "research_name": "Human-Object Contact Prediction",
          "why": "Targets physical interaction state, a core affordance and manipulation signal."
        },
        {
          "architecture_family": "multi-label classifier",
          "case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.",
          "current_limit": "Object labels are language-derived and sparse in one episode.",
          "direction_roles": {
            "A": "proxy",
            "C": "direct",
            "D": "proxy"
          },
          "display_name": "Object Relevance Prediction",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json",
              "label": "Neural metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv",
              "label": "Minimal predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv",
              "label": "Neural predictions"
            }
          ],
          "family": "supervised",
          "id": "object_relevance",
          "input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.",
          "input_short": "non-caption multimodal features",
          "metric": {
            "better_baseline": "minimal",
            "direction": "higher",
            "key": "micro_f1",
            "minimal": 0.1803,
            "name": "micro-F1",
            "neural_mlp": 0.1679
          },
          "modalities": [
            "video",
            "depth",
            "pose_slam",
            "motion_capture",
            "inertial"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "relevant object set",
          "primary_direction": "C",
          "process_short": "object vocabulary -> multi-hot labels -> sigmoid heads",
          "research_name": "Object-Centric Interaction Recognition",
          "why": "Connects egocentric activity to manipulated objects and early object-centric state."
        }
      ]
    },
    {
      "code": "B",
      "counts": {
        "diagnostic": 1,
        "direct": 0,
        "proxy": 2,
        "total_links": 3
      },
      "current_readout": "The current suite checks cross-modal alignment and depth/video reconstruction proxies; it does not yet train a renderer or reconstruct geometry.",
      "current_status": "proxy tasks only",
      "extension_tasks": [
        {
          "current_limit": "This checks calibrated multi-view signal, but it is still feature retrieval, not NeRF, Gaussian Splatting, or novel-view synthesis.",
          "family": "retrieval",
          "id": "multi_view_consistency_retrieval",
          "metric_name": "MRR",
          "name": "Multi-View Consistency Retrieval"
        }
      ],
      "focus": "Multi-view dynamic scene reconstruction, NeRF/Gaussian Splatting, novel-view synthesis.",
      "id": "reconstruction_rendering",
      "name": "3D/4D Reconstruction & Neural Rendering",
      "next_steps": [
        "Use calibrated multi-view video plus SLAM pose to build per-episode camera trajectories.",
        "Add depth-supervised point clouds, TSDF, Gaussian Splatting, or NeRF baselines.",
        "Evaluate novel-view synthesis and temporal consistency across held-out views/time."
      ],
      "preferred_background": "3D reconstruction, neural rendering, camera calibration, and bundle adjustment.",
      "task_ids": [
        "cross_modal_retrieval",
        "modality_reconstruction",
        "misalignment_detection"
      ],
      "tasks": [
        {
          "architecture_family": "two-tower retrieval head",
          "case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.",
          "current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.",
          "direction_roles": {
            "B": "proxy",
            "C": "diagnostic",
            "D": "proxy"
          },
          "display_name": "Cross-Modal Retrieval",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json",
              "label": "Neural metrics"
            }
          ],
          "family": "retrieval",
          "id": "cross_modal_retrieval",
          "input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.",
          "input_short": "motion/IMU/pose query; depth/video candidates",
          "metric": {
            "better_baseline": "minimal",
            "direction": "higher",
            "key": "mrr",
            "minimal": 0.2693,
            "name": "MRR",
            "neural_mlp": 0.13
          },
          "modalities": [
            "motion_capture",
            "inertial",
            "pose_slam",
            "depth",
            "video"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "ranked visual windows",
          "primary_direction": "C",
          "process_short": "modality split -> projection -> nearest-neighbor ranker",
          "research_name": "Multimodal Representation Retrieval",
          "why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling."
        },
        {
          "architecture_family": "feature regressor",
          "case_study": "Given motion, IMU, and camera-pose signals while the hand moves, predict the matching depth/video feature vector.",
          "current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.",
          "direction_roles": {
            "B": "proxy",
            "D": "proxy"
          },
          "display_name": "Cross-Modal Reconstruction",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/modality_reconstruction/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json",
              "label": "Neural metrics"
            }
          ],
          "family": "forecast",
          "id": "modality_reconstruction",
          "input": "Motion, IMU, and camera/pose features as input; depth/video features as the regression target.",
          "input_short": "motion, IMU, and camera/pose features",
          "metric": {
            "better_baseline": "neural_mlp",
            "direction": "higher",
            "key": "r2",
            "minimal": -0.0153,
            "name": "R2",
            "neural_mlp": -0.0102
          },
          "modalities": [
            "motion_capture",
            "inertial",
            "pose_slam",
            "depth",
            "video"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "reconstructed depth/video vector",
          "primary_direction": "B",
          "process_short": "source-target split -> scaler -> regression head",
          "research_name": "Modality Feature Reconstruction",
          "why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective."
        },
        {
          "architecture_family": "pairwise classifier",
          "case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.",
          "current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.",
          "direction_roles": {
            "B": "diagnostic",
            "C": "diagnostic",
            "D": "diagnostic"
          },
          "display_name": "Multimodal Synchronization Detection",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json",
              "label": "Neural metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv",
              "label": "Minimal predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv",
              "label": "Neural predictions"
            }
          ],
          "family": "diagnostic",
          "id": "misalignment_detection",
          "input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.",
          "input_short": "motion-side and visual/depth-side feature groups",
          "metric": {
            "better_baseline": "neural_mlp",
            "direction": "higher",
            "key": "f1",
            "minimal": 0.5052,
            "name": "F1",
            "neural_mlp": 0.7153
          },
          "modalities": [
            "motion_capture",
            "inertial",
            "video",
            "depth",
            "pose_slam"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "aligned or shifted",
          "primary_direction": "C",
          "process_short": "aligned/shifted pairs -> feature combiner -> binary classifier",
          "research_name": "Cross-Modal Misalignment Detection",
          "why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models."
        }
      ]
    },
    {
      "code": "C",
      "counts": {
        "diagnostic": 3,
        "direct": 6,
        "proxy": 2,
        "total_links": 11
      },
      "current_readout": "Most of the 12 tasks directly target egocentric action, task state, interaction, grounding, and alignment.",
      "current_status": "strongest implemented track",
      "extension_tasks": [
        {
          "current_limit": "This is an action-structure probe inside one episode, not a general intent model across homes, people, or tasks.",
          "family": "regression",
          "id": "action_phase_progress",
          "metric_name": "MAE",
          "name": "Action Phase Progress Estimation"
        }
      ],
      "focus": "Egocentric action and intention understanding, hand-object interaction, gaze/attention modeling, task structure modeling.",
      "id": "egocentric_interaction",
      "name": "Egocentric Vision & Interaction",
      "next_steps": [
        "Move from single-episode chronological splits to held-out-episode splits.",
        "Use the audio signal with stronger multimodal backbones for action, intent, and grounding.",
        "Evaluate long-horizon task success prediction and action-conditioned generation."
      ],
      "preferred_background": "Video understanding, action recognition, or egocentric vision.",
      "task_ids": [
        "timeline_action",
        "timeline_subtask",
        "transition_detection",
        "next_action",
        "hand_trajectory_forecast",
        "contact_prediction",
        "object_relevance",
        "caption_grounding",
        "cross_modal_retrieval",
        "temporal_order",
        "misalignment_detection"
      ],
      "tasks": [
        {
          "architecture_family": "multiclass classifier",
          "case_study": "In the coffee-making sample, if the 20-frame window is during a pouring moment, the task asks the model to output an action such as Pour coffee or Pour milk into coffee.",
          "current_limit": "Chronological single-episode split creates unseen future action classes.",
          "direction_roles": {
            "A": "proxy",
            "C": "direct"
          },
          "display_name": "Action Recognition",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/metrics.json",
              "label": "Neural metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/predictions.csv",
              "label": "Minimal predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/predictions.csv",
              "label": "Neural predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/confusion_matrix.csv",
              "label": "Confusion matrix"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/confusion_matrix.csv",
              "label": "Neural confusion matrix"
            }
          ],
          "family": "supervised",
          "id": "timeline_action",
          "input": "One 20-frame window represented by the current feature vector: video/audio/depth summaries, pose, SLAM/camera pose, motion capture, IMU, calibration, and language-derived context.",
          "input_short": "20-frame multimodal window",
          "metric": {
            "better_baseline": "minimal",
            "direction": "higher",
            "key": "macro_f1",
            "minimal": 0.05,
            "name": "macro-F1",
            "neural_mlp": 0.0148
          },
          "modalities": [
            "video",
            "depth",
            "pose_slam",
            "motion_capture",
            "inertial",
            "language"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "current action class",
          "primary_direction": "C",
          "process_short": "window features -> action label builder -> classifier",
          "research_name": "Egocentric Action Recognition",
          "why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout."
        },
        {
          "architecture_family": "multiclass classifier",
          "case_study": "A pouring action may belong to a broader subtask such as preparing or pouring a drink. The model predicts that broader stage instead of a fine action.",
          "current_limit": "Single-episode ordering makes future subtasks hard to generalize.",
          "direction_roles": {
            "C": "direct",
            "D": "proxy"
          },
          "display_name": "Procedure Step Recognition",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json",
              "label": "Neural metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/predictions.csv",
              "label": "Minimal predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/predictions.csv",
              "label": "Neural predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/confusion_matrix.csv",
              "label": "Confusion matrix"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/confusion_matrix.csv",
              "label": "Neural confusion matrix"
            }
          ],
          "family": "supervised",
          "id": "timeline_subtask",
          "input": "The same all-modality window vector used by action recognition.",
          "input_short": "20-frame multimodal window",
          "metric": {
            "better_baseline": "minimal",
            "direction": "higher",
            "key": "macro_f1",
            "minimal": 0.0506,
            "name": "macro-F1",
            "neural_mlp": 0.0281
          },
          "modalities": [
            "video",
            "depth",
            "pose_slam",
            "motion_capture",
            "inertial",
            "language"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "current procedure step",
          "primary_direction": "C",
          "process_short": "window features -> subtask label builder -> classifier",
          "research_name": "Temporal Subtask Recognition",
          "why": "Segments egocentric task state and provides a first proxy for symbolic world/task state."
        },
        {
          "architecture_family": "binary classifier",
          "case_study": "When the demonstrator changes from preparing to pouring, the model should flag a boundary instead of a steady action window.",
          "current_limit": "Boundary class is sparse, so accuracy alone is misleading.",
          "direction_roles": {
            "C": "direct",
            "D": "diagnostic"
          },
          "display_name": "Action Boundary Detection",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/metrics.json",
              "label": "Neural metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/predictions.csv",
              "label": "Minimal predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/predictions.csv",
              "label": "Neural predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/confusion_matrix.csv",
              "label": "Confusion matrix"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/confusion_matrix.csv",
              "label": "Neural confusion matrix"
            }
          ],
          "family": "diagnostic",
          "id": "transition_detection",
          "input": "One all-modality window vector; the boundary labels are derived from action-change timestamps and serve as the prediction target, not as an input feature.",
          "input_short": "current window with boundary target",
          "metric": {
            "better_baseline": "minimal",
            "direction": "higher",
            "key": "macro_f1",
            "minimal": 0.6118,
            "name": "macro-F1",
            "neural_mlp": 0.5862
          },
          "modalities": [
            "video",
            "pose_slam",
            "motion_capture",
            "inertial",
            "language"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "boundary or steady",
          "primary_direction": "C",
          "process_short": "action changes -> boundary labels -> binary classifier",
          "research_name": "Temporal Action Segmentation",
          "why": "Localizes egocentric task boundaries and diagnoses temporal state changes."
        },
        {
          "architecture_family": "future-label classifier",
          "case_study": "If a window shows the person preparing to pour, the target can be the action 20 frames later, such as the start of pouring.",
          "current_limit": "Unseen future labels dominate the single-episode chronological test.",
          "direction_roles": {
            "C": "direct",
            "D": "proxy"
          },
          "display_name": "Next-Action Prediction",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/metrics.json",
              "label": "Neural metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/predictions.csv",
              "label": "Minimal predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/predictions.csv",
              "label": "Neural predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/confusion_matrix.csv",
              "label": "Confusion matrix"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/confusion_matrix.csv",
              "label": "Neural confusion matrix"
            }
          ],
          "family": "supervised",
          "id": "next_action",
          "input": "The current all-modality window vector at time t.",
          "input_short": "current window at time t",
          "metric": {
            "better_baseline": "minimal",
            "direction": "higher",
            "key": "macro_f1",
            "minimal": 0.0593,
            "name": "macro-F1",
            "neural_mlp": 0.0419
          },
          "modalities": [
            "video",
            "depth",
            "pose_slam",
            "motion_capture",
            "inertial"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "action at t+20 frames",
          "primary_direction": "C",
          "process_short": "current features -> future label shift -> classifier",
          "research_name": "Short-Horizon Intention Prediction",
          "why": "Tests action intention/task-flow prediction from egocentric context."
        },
        {
          "architecture_family": "continuous regressor",
          "case_study": "When the hand is moving toward a cup or bottle, the model predicts the future 3D hand-joint path.",
          "current_limit": "Forecasting is window-level and not yet a full sequence or policy model.",
          "direction_roles": {
            "A": "direct",
            "C": "proxy"
          },
          "display_name": "Hand Trajectory Forecasting",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/hand_trajectory_forecast/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json",
              "label": "Neural metrics"
            }
          ],
          "family": "forecast",
          "id": "hand_trajectory_forecast",
          "input": "The current all-modality window vector at time t.",
          "input_short": "current multimodal window",
          "metric": {
            "better_baseline": "neural_mlp",
            "direction": "lower",
            "key": "mpjpe",
            "minimal": 0.8647,
            "name": "MPJPE",
            "neural_mlp": 0.1079
          },
          "modalities": [
            "motion_capture",
            "video",
            "depth",
            "pose_slam",
            "inertial"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "future hand-joint trajectory",
          "primary_direction": "A",
          "process_short": "current features -> future mocap target -> regression head",
          "research_name": "3D Hand Motion Forecasting",
          "why": "Directly predicts human hand motion and supports hand-object interaction modeling."
        },
        {
          "architecture_family": "binary classifier",
          "case_study": "During manipulation, the hand may touch a cup, table, or bottle. The task asks whether any contact is happening.",
          "current_limit": "The public sample is degenerate for this target because one class dominates, so the perfect 1.0 macro-F1 tie between the two baselines is not informative.",
          "direction_roles": {
            "A": "direct",
            "C": "proxy"
          },
          "display_name": "Contact State Prediction",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/metrics.json",
              "label": "Neural metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/predictions.csv",
              "label": "Minimal predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/predictions.csv",
              "label": "Neural predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/confusion_matrix.csv",
              "label": "Confusion matrix"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/confusion_matrix.csv",
              "label": "Neural confusion matrix"
            }
          ],
          "family": "supervised",
          "id": "contact_prediction",
          "input": "Non-contact and non-caption feature blocks, so the answer is not directly leaked from the target labels.",
          "input_short": "non-contact, non-caption features",
          "metric": {
            "better_baseline": "tie",
            "direction": "higher",
            "key": "macro_f1",
            "minimal": 1.0,
            "name": "macro-F1",
            "neural_mlp": 1.0
          },
          "modalities": [
            "motion_capture",
            "video",
            "depth",
            "inertial"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "contact or no contact",
          "primary_direction": "A",
          "process_short": "feature filter -> contact target -> binary classifier",
          "research_name": "Human-Object Contact Prediction",
          "why": "Targets physical interaction state, a core affordance and manipulation signal."
        },
        {
          "architecture_family": "multi-label classifier",
          "case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.",
          "current_limit": "Object labels are language-derived and sparse in one episode.",
          "direction_roles": {
            "A": "proxy",
            "C": "direct",
            "D": "proxy"
          },
          "display_name": "Object Relevance Prediction",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json",
              "label": "Neural metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv",
              "label": "Minimal predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv",
              "label": "Neural predictions"
            }
          ],
          "family": "supervised",
          "id": "object_relevance",
          "input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.",
          "input_short": "non-caption multimodal features",
          "metric": {
            "better_baseline": "minimal",
            "direction": "higher",
            "key": "micro_f1",
            "minimal": 0.1803,
            "name": "micro-F1",
            "neural_mlp": 0.1679
          },
          "modalities": [
            "video",
            "depth",
            "pose_slam",
            "motion_capture",
            "inertial"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "relevant object set",
          "primary_direction": "C",
          "process_short": "object vocabulary -> multi-hot labels -> sigmoid heads",
          "research_name": "Object-Centric Interaction Recognition",
          "why": "Connects egocentric activity to manipulated objects and early object-centric state."
        },
        {
          "architecture_family": "retrieval ranker",
          "case_study": "A query like 'Pour milk into coffee' should rank the windows from the actual pouring moment higher than unrelated windows.",
          "current_limit": "Bag-of-objects language features are too weak for rich grounding.",
          "direction_roles": {
            "C": "direct",
            "D": "proxy"
          },
          "display_name": "Language Grounding",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/caption_grounding/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/caption_grounding/metrics.json",
              "label": "Neural metrics"
            }
          ],
          "family": "retrieval",
          "id": "caption_grounding",
          "input": "Caption/object/interaction query features and a set of candidate sensor-window features.",
          "input_short": "text-like query and candidate windows",
          "metric": {
            "better_baseline": "neural_mlp",
            "direction": "higher",
            "key": "mrr",
            "minimal": 0.016,
            "name": "MRR",
            "neural_mlp": 0.0168
          },
          "modalities": [
            "language",
            "video",
            "depth",
            "pose_slam"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "ranked matching moments",
          "primary_direction": "C",
          "process_short": "query features -> candidate index -> cosine ranker",
          "research_name": "Language-to-Moment Grounding",
          "why": "Grounds language annotation into egocentric sensor time and task state."
        },
        {
          "architecture_family": "two-tower retrieval head",
          "case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.",
          "current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.",
          "direction_roles": {
            "B": "proxy",
            "C": "diagnostic",
            "D": "proxy"
          },
          "display_name": "Cross-Modal Retrieval",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json",
              "label": "Neural metrics"
            }
          ],
          "family": "retrieval",
          "id": "cross_modal_retrieval",
          "input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.",
          "input_short": "motion/IMU/pose query; depth/video candidates",
          "metric": {
            "better_baseline": "minimal",
            "direction": "higher",
            "key": "mrr",
            "minimal": 0.2693,
            "name": "MRR",
            "neural_mlp": 0.13
          },
          "modalities": [
            "motion_capture",
            "inertial",
            "pose_slam",
            "depth",
            "video"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "ranked visual windows",
          "primary_direction": "C",
          "process_short": "modality split -> projection -> nearest-neighbor ranker",
          "research_name": "Multimodal Representation Retrieval",
          "why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling."
        },
        {
          "architecture_family": "pairwise classifier",
          "case_study": "If window A shows reaching and window B shows pouring, the model should distinguish A then B from B then A.",
          "current_limit": "Only local adjacent ordering, not long-horizon causal modeling.",
          "direction_roles": {
            "C": "diagnostic",
            "D": "diagnostic"
          },
          "display_name": "Temporal Order Verification",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/metrics.json",
              "label": "Neural metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/predictions.csv",
              "label": "Minimal predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/predictions.csv",
              "label": "Neural predictions"
            }
          ],
          "family": "diagnostic",
          "id": "temporal_order",
          "input": "A pair of adjacent window vectors, plus their difference vector.",
          "input_short": "two adjacent windows plus difference vector",
          "metric": {
            "better_baseline": "neural_mlp",
            "direction": "higher",
            "key": "f1",
            "minimal": 0.54,
            "name": "F1",
            "neural_mlp": 0.852
          },
          "modalities": [
            "video",
            "pose_slam",
            "motion_capture",
            "inertial"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "correct or reversed",
          "primary_direction": "C",
          "process_short": "pair builder -> feature combiner -> binary classifier",
          "research_name": "Temporal Order Verification",
          "why": "Checks whether features encode local time direction and task progression."
        },
        {
          "architecture_family": "pairwise classifier",
          "case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.",
          "current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.",
          "direction_roles": {
            "B": "diagnostic",
            "C": "diagnostic",
            "D": "diagnostic"
          },
          "display_name": "Multimodal Synchronization Detection",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json",
              "label": "Neural metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv",
              "label": "Minimal predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv",
              "label": "Neural predictions"
            }
          ],
          "family": "diagnostic",
          "id": "misalignment_detection",
          "input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.",
          "input_short": "motion-side and visual/depth-side feature groups",
          "metric": {
            "better_baseline": "neural_mlp",
            "direction": "higher",
            "key": "f1",
            "minimal": 0.5052,
            "name": "F1",
            "neural_mlp": 0.7153
          },
          "modalities": [
            "motion_capture",
            "inertial",
            "video",
            "depth",
            "pose_slam"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "aligned or shifted",
          "primary_direction": "C",
          "process_short": "aligned/shifted pairs -> feature combiner -> binary classifier",
          "research_name": "Cross-Modal Misalignment Detection",
          "why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models."
        }
      ]
    },
    {
      "code": "D",
      "counts": {
        "diagnostic": 3,
        "direct": 0,
        "proxy": 6,
        "total_links": 9
      },
      "current_readout": "The current tasks probe temporal structure, object relevance, cross-modal retrieval, and modality prediction, but they do not yet build persistent maps or scene graphs.",
      "current_status": "early proxy tasks",
      "extension_tasks": [
        {
          "current_limit": "This is a compact world-model proxy; it does not build a persistent map, scene graph, or object permanence model.",
          "family": "forecast",
          "id": "ego_motion_forecast",
          "metric_name": "MAE",
          "name": "Short-Horizon Ego-Motion Forecasting"
        }
      ],
      "focus": "Long-term consistent 3D/4D scene mapping, scene graphs, object- and space-centric representations, spatial reasoning.",
      "id": "world_modeling",
      "name": "Scene Reconstruction & World Modeling",
      "next_steps": [
        "Convert windows into persistent object/scene-state nodes with timestamps and camera poses.",
        "Add map consistency, object permanence, and spatial relation prediction tasks.",
        "Train held-out-episode world models that predict future observations and task state."
      ],
      "preferred_background": "Large-scale mapping, semantic reconstruction, or agent world models.",
      "task_ids": [
        "timeline_subtask",
        "transition_detection",
        "next_action",
        "object_relevance",
        "caption_grounding",
        "cross_modal_retrieval",
        "modality_reconstruction",
        "temporal_order",
        "misalignment_detection"
      ],
      "tasks": [
        {
          "architecture_family": "multiclass classifier",
          "case_study": "A pouring action may belong to a broader subtask such as preparing or pouring a drink. The model predicts that broader stage instead of a fine action.",
          "current_limit": "Single-episode ordering makes future subtasks hard to generalize.",
          "direction_roles": {
            "C": "direct",
            "D": "proxy"
          },
          "display_name": "Procedure Step Recognition",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json",
              "label": "Neural metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/predictions.csv",
              "label": "Minimal predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/predictions.csv",
              "label": "Neural predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/confusion_matrix.csv",
              "label": "Confusion matrix"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/confusion_matrix.csv",
              "label": "Neural confusion matrix"
            }
          ],
          "family": "supervised",
          "id": "timeline_subtask",
          "input": "The same all-modality window vector used by action recognition.",
          "input_short": "20-frame multimodal window",
          "metric": {
            "better_baseline": "minimal",
            "direction": "higher",
            "key": "macro_f1",
            "minimal": 0.0506,
            "name": "macro-F1",
            "neural_mlp": 0.0281
          },
          "modalities": [
            "video",
            "depth",
            "pose_slam",
            "motion_capture",
            "inertial",
            "language"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "current procedure step",
          "primary_direction": "C",
          "process_short": "window features -> subtask label builder -> classifier",
          "research_name": "Temporal Subtask Recognition",
          "why": "Segments egocentric task state and provides a first proxy for symbolic world/task state."
        },
        {
          "architecture_family": "binary classifier",
          "case_study": "When the demonstrator changes from preparing to pouring, the model should flag a boundary instead of a steady action window.",
          "current_limit": "Boundary class is sparse, so accuracy alone is misleading.",
          "direction_roles": {
            "C": "direct",
            "D": "diagnostic"
          },
          "display_name": "Action Boundary Detection",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/metrics.json",
              "label": "Neural metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/predictions.csv",
              "label": "Minimal predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/predictions.csv",
              "label": "Neural predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/confusion_matrix.csv",
              "label": "Confusion matrix"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/confusion_matrix.csv",
              "label": "Neural confusion matrix"
            }
          ],
          "family": "diagnostic",
          "id": "transition_detection",
          "input": "One all-modality window vector plus labels derived from action-change timestamps.",
          "input_short": "current window with boundary target",
          "metric": {
            "better_baseline": "minimal",
            "direction": "higher",
            "key": "macro_f1",
            "minimal": 0.6118,
            "name": "macro-F1",
            "neural_mlp": 0.5862
          },
          "modalities": [
            "video",
            "pose_slam",
            "motion_capture",
            "inertial",
            "language"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "boundary or steady",
          "primary_direction": "C",
          "process_short": "action changes -> boundary labels -> binary classifier",
          "research_name": "Temporal Action Segmentation",
          "why": "Localizes egocentric task boundaries and diagnoses temporal state changes."
        },
        {
          "architecture_family": "future-label classifier",
          "case_study": "If a window shows the person preparing to pour, the target can be the action 20 frames later, such as the start of pouring.",
          "current_limit": "Unseen future labels dominate the single-episode chronological test.",
          "direction_roles": {
            "C": "direct",
            "D": "proxy"
          },
          "display_name": "Next-Action Prediction",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/metrics.json",
              "label": "Neural metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/predictions.csv",
              "label": "Minimal predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/predictions.csv",
              "label": "Neural predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/confusion_matrix.csv",
              "label": "Minimal confusion matrix"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/confusion_matrix.csv",
              "label": "Neural confusion matrix"
            }
          ],
          "family": "supervised",
          "id": "next_action",
          "input": "The current all-modality window vector at time t.",
          "input_short": "current window at time t",
          "metric": {
            "better_baseline": "minimal",
            "direction": "higher",
            "key": "macro_f1",
            "minimal": 0.0593,
            "name": "macro-F1",
            "neural_mlp": 0.0419
          },
          "modalities": [
            "video",
            "depth",
            "pose_slam",
            "motion_capture",
            "inertial"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "action at t+20 frames",
          "primary_direction": "C",
          "process_short": "current features -> future label shift -> classifier",
          "research_name": "Short-Horizon Intention Prediction",
          "why": "Tests action intention/task-flow prediction from egocentric context."
        },
        {
          "architecture_family": "multi-label classifier",
          "case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.",
          "current_limit": "Object labels are language-derived and sparse in one episode.",
          "direction_roles": {
            "A": "proxy",
            "C": "direct",
            "D": "proxy"
          },
          "display_name": "Object Relevance Prediction",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json",
              "label": "Neural metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv",
              "label": "Minimal predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv",
              "label": "Neural predictions"
            }
          ],
          "family": "supervised",
          "id": "object_relevance",
          "input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.",
          "input_short": "non-caption multimodal features",
          "metric": {
            "better_baseline": "minimal",
            "direction": "higher",
            "key": "micro_f1",
            "minimal": 0.1803,
            "name": "micro-F1",
            "neural_mlp": 0.1679
          },
          "modalities": [
            "video",
            "depth",
            "pose_slam",
            "motion_capture",
            "inertial"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "relevant object set",
          "primary_direction": "C",
          "process_short": "object vocabulary -> multi-hot labels -> sigmoid heads",
          "research_name": "Object-Centric Interaction Recognition",
          "why": "Connects egocentric activity to manipulated objects and early object-centric state."
        },
        {
          "architecture_family": "retrieval ranker",
          "case_study": "A query like Pour milk into coffee should rank the windows from the actual pouring moment higher than unrelated windows.",
          "current_limit": "Bag-of-objects language features are too weak for rich grounding.",
          "direction_roles": {
            "C": "direct",
            "D": "proxy"
          },
          "display_name": "Language Grounding",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/caption_grounding/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/caption_grounding/metrics.json",
              "label": "Neural metrics"
            }
          ],
          "family": "retrieval",
          "id": "caption_grounding",
          "input": "Caption/object/interaction query features and a set of candidate sensor-window features.",
          "input_short": "text-like query and candidate windows",
          "metric": {
            "better_baseline": "neural_mlp",
            "direction": "higher",
            "key": "mrr",
            "minimal": 0.016,
            "name": "MRR",
            "neural_mlp": 0.0168
          },
          "modalities": [
            "language",
            "video",
            "depth",
            "pose_slam"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "ranked matching moments",
          "primary_direction": "C",
          "process_short": "query features -> candidate index -> cosine ranker",
          "research_name": "Language-to-Moment Grounding",
          "why": "Grounds language annotation into egocentric sensor time and task state."
        },
        {
          "architecture_family": "two-tower retrieval head",
          "case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.",
          "current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.",
          "direction_roles": {
            "B": "proxy",
            "C": "diagnostic",
            "D": "proxy"
          },
          "display_name": "Cross-Modal Retrieval",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json",
              "label": "Neural metrics"
            }
          ],
          "family": "retrieval",
          "id": "cross_modal_retrieval",
          "input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.",
          "input_short": "motion/IMU/pose query; depth/video candidates",
          "metric": {
            "better_baseline": "minimal",
            "direction": "higher",
            "key": "mrr",
            "minimal": 0.2693,
            "name": "MRR",
            "neural_mlp": 0.13
          },
          "modalities": [
            "motion_capture",
            "inertial",
            "pose_slam",
            "depth",
            "video"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "ranked visual windows",
          "primary_direction": "C",
          "process_short": "modality split -> projection -> nearest-neighbor ranker",
          "research_name": "Multimodal Representation Retrieval",
          "why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling."
        },
        {
          "architecture_family": "feature regressor",
          "case_study": "Given motion, IMU, and camera-pose signals while the hand moves, predict the matching depth/video feature vector.",
          "current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.",
          "direction_roles": {
            "B": "proxy",
            "D": "proxy"
          },
          "display_name": "Cross-Modal Reconstruction",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/modality_reconstruction/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json",
              "label": "Neural metrics"
            }
          ],
          "family": "forecast",
          "id": "modality_reconstruction",
          "input": "Motion, IMU, and camera/pose features as input; depth/video features as the regression target.",
          "input_short": "motion, IMU, and camera/pose features",
          "metric": {
            "better_baseline": "neural_mlp",
            "direction": "higher",
            "key": "r2",
            "minimal": -0.0153,
            "name": "R2",
            "neural_mlp": -0.0102
          },
          "modalities": [
            "motion_capture",
            "inertial",
            "pose_slam",
            "depth",
            "video"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "reconstructed depth/video vector",
          "primary_direction": "B",
          "process_short": "source-target split -> scaler -> regression head",
          "research_name": "Modality Feature Reconstruction",
          "why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective."
        },
        {
          "architecture_family": "pairwise classifier",
          "case_study": "If window A shows reaching and window B shows pouring, the model should distinguish A then B from B then A.",
          "current_limit": "Only local adjacent ordering, not long-horizon causal modeling.",
          "direction_roles": {
            "C": "diagnostic",
            "D": "diagnostic"
          },
          "display_name": "Temporal Order Verification",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/metrics.json",
              "label": "Neural metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/predictions.csv",
              "label": "Minimal predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/predictions.csv",
              "label": "Neural predictions"
            }
          ],
          "family": "diagnostic",
          "id": "temporal_order",
          "input": "A pair of adjacent window vectors, plus their difference vector.",
          "input_short": "two adjacent windows plus difference vector",
          "metric": {
            "better_baseline": "neural_mlp",
            "direction": "higher",
            "key": "f1",
            "minimal": 0.54,
            "name": "F1",
            "neural_mlp": 0.852
          },
          "modalities": [
            "video",
            "pose_slam",
            "motion_capture",
            "inertial"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "correct or reversed",
          "primary_direction": "C",
          "process_short": "pair builder -> feature combiner -> binary classifier",
          "research_name": "Temporal Order Verification",
          "why": "Checks whether features encode local time direction and task progression."
        },
        {
          "architecture_family": "pairwise classifier",
          "case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.",
          "current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.",
          "direction_roles": {
            "B": "diagnostic",
            "C": "diagnostic",
            "D": "diagnostic"
          },
          "display_name": "Multimodal Synchronization Detection",
          "evidence_links": [
            {
              "href": "data/task_walkthroughs.json",
              "label": "Task walkthrough"
            },
            {
              "href": "single_episode_explorer.html",
              "label": "Single-episode explorer"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json",
              "label": "Minimal metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json",
              "label": "Neural metrics"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv",
              "label": "Minimal predictions"
            },
            {
              "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv",
              "label": "Neural predictions"
            }
          ],
          "family": "diagnostic",
          "id": "misalignment_detection",
          "input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.",
          "input_short": "motion-side and visual/depth-side feature groups",
          "metric": {
            "better_baseline": "neural_mlp",
            "direction": "higher",
            "key": "f1",
            "minimal": 0.5052,
            "name": "F1",
            "neural_mlp": 0.7153
          },
          "modalities": [
            "motion_capture",
            "inertial",
            "video",
            "depth",
            "pose_slam"
          ],
          "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
          "output_short": "aligned or shifted",
          "primary_direction": "C",
          "process_short": "aligned/shifted pairs -> feature combiner -> binary classifier",
          "research_name": "Cross-Modal Misalignment Detection",
          "why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models."
        }
      ]
    }
  ],
  "foundation_model_plan": {
    "decision": {
      "external_reasoning_reference": "Gemini Robotics",
      "first_policy_branch_candidates": [
        "OpenVLA / OpenVLA-OFT",
        "openpi pi0/pi0.5",
        "NVIDIA GR00T"
      ],
      "first_world_model_branch": "Cosmos 3",
      "immediate_trainable_backbone": "Qwen3-Omni"
    },
    "evaluation_additions": [
      {
        "metrics": [
          "JSON validity",
          "macro-F1",
          "accuracy",
          "micro-F1"
        ],
        "model_families": [
          "Qwen3-Omni",
          "Gemini Robotics reference"
        ],
        "target": "structured_task_prediction"
      },
      {
        "metrics": [
          "retrieval rank",
          "temporal consistency",
          "feature reconstruction",
          "qualitative visual inspection"
        ],
        "model_families": [
          "Cosmos 3"
        ],
        "target": "future_state_prediction"
      },
      {
        "metrics": [
          "transition accuracy",
          "contact accuracy",
          "next-action accuracy"
        ],
        "model_families": [
          "Cosmos 3",
          "OpenVLA",
          "openpi",
          "GR00T"
        ],
        "target": "action_conditioned_dynamics"
      },
      {
        "metrics": [
          "held-out episode metrics",
          "held-out session metrics",
          "leakage audit"
        ],
        "model_families": [
          "all trainable branches"
        ],
        "target": "cross_episode_generalization"
      }
    ],
    "execution_order": [
      {
        "action": "Stage at least 32 valid Xperience-10M episodes with a held-out episode split.",
        "name": "Data gate",
        "step": 1
      },
      {
        "action": "Run Qwen3-Omni LoRA to establish the full train/eval loop.",
        "name": "First held-out baseline",
        "step": 2
      },
      {
        "action": "Run 3-8 episode dry runs for Qwen3-Omni prompt/LoRA, Cosmos 3 preprocessing, and one policy candidate.",
        "name": "Model-selection dry run",
        "step": 3
      },
      {
        "action": "Promote Cosmos 3 if future-window/action-conditioned preprocessing fits storage and compute.",
        "name": "World-model branch",
        "step": 4
      },
      {
        "action": "Promote OpenVLA/openpi/GR00T after action target conversion and retargeting artifacts are traceable.",
        "name": "Policy branch",
        "step": 5
      },
      {
        "action": "Publish branch results only with real manifests, predictions, metrics, and qualitative examples.",
        "name": "Publication rule",
        "step": 6
      }
    ],
    "model_families": [
      {
        "best_role": "First selected-episode multimodal LoRA pilot and structured task predictor.",
        "category": "omni_instruction_model",
        "current_decision": "keep_as_first_pilot",
        "entry_condition": "Selected episodes are staged with a held-out episode split.",
        "family": "Qwen3-Omni",
        "openness": "open_weights_available_from_official_hf_repo",
        "priority": 1,
        "public_source": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct",
        "xperience10m_fit": [
          "RGB/fisheye video, embedded audio, and language prompts can enter directly.",
          "Depth, pose/SLAM, mocap, contacts, and IMU enter through the existing sensor bridge.",
          "Matches current task outputs: labels, structured JSON, captions, and short decisions."
        ]
      },
      {
        "best_role": "Embodied world modeling, action generation, future-window prediction, and synthetic-data expansion.",
        "category": "world_foundation_model",
        "current_decision": "add_as_first_world_model_branch_after_data_gate",
        "entry_condition": "Multi-episode data plus enough storage/compute for generated or latent video-state outputs.",
        "family": "Cosmos 3",
        "openness": "track_official_nvidia_release_and_available_weights",
        "priority": 2,
        "public_source": "https://www.nvidia.com/en-us/ai/cosmos/",
        "xperience10m_fit": [
          "Uses video streams as visual state.",
          "Uses pose/SLAM, depth, mocap, IMU, and language as physical-world conditioning signals.",
          "Better aligned with prediction/generation objectives than simple label classification."
        ]
      },
      {
        "best_role": "Humanoid action understanding, retargeting, contact/action prediction, and embodied skill transfer.",
        "category": "humanoid_policy_foundation_model",
        "current_decision": "track_as_humanoid_policy_branch",
        "entry_condition": "Retargeting artifact and action-space definition exist.",
        "family": "NVIDIA GR00T",
        "openness": "track_official_nvidia_release_and_tooling",
        "priority": 3,
        "public_source": "https://developer.nvidia.com/isaac/gr00t",
        "xperience10m_fit": [
          "Hand/body mocap and contact cues can be retargeted into humanoid state/action targets.",
          "Egocentric video plus human motion can support affordance and interaction tasks."
        ]
      },
      {
        "best_role": "Open robot-policy baseline after observations and action labels are converted into a VLA format.",
        "category": "vision_language_action_policy",
        "current_decision": "candidate_after_action_space_design",
        "entry_condition": "Window-to-action-token conversion is implemented and audited.",
        "family": "OpenVLA / OpenVLA-OFT",
        "openness": "open_project_and_weights",
        "priority": 4,
        "public_source": "https://openvla.github.io/",
        "xperience10m_fit": [
          "Good candidate when each window is expressed as visual observation, instruction/context, and action token.",
          "Requires an explicit action target; current human egocentric labels are not robot controls by default."
        ]
      },
      {
        "best_role": "Action-chunking, policy fine-tuning, and embodiment-transfer experiments.",
        "category": "robot_policy_model",
        "current_decision": "candidate_policy_branch",
        "entry_condition": "Action target and train/eval protocol exist for at least 64 episodes.",
        "family": "openpi pi0/pi0.5",
        "openness": "open_source_policy_training_stack",
        "priority": 5,
        "public_source": "https://github.com/Physical-Intelligence/openpi",
        "xperience10m_fit": [
          "Useful once hand trajectories, contacts, or retargeted body motion are converted into policy targets.",
          "Better for policy branch than for current structured task JSON outputs."
        ]
      },
      {
        "best_role": "Qualitative reasoning reference, annotation helper, and external comparison when API access exists.",
        "category": "closed_embodied_reasoning_reference",
        "current_decision": "external_reference_only",
        "entry_condition": "API/access exists and outputs are logged separately from trainable model metrics.",
        "family": "Gemini Robotics",
        "openness": "closed_or_limited_access",
        "priority": 6,
        "public_source": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/",
        "xperience10m_fit": [
          "Can help reason over egocentric scenes and task descriptions.",
          "Not a local fine-tune target for this repo."
        ]
      },
      {
        "best_role": "Cheaper policy baselines for observation-to-action experiments.",
        "category": "lightweight_robot_policy_baselines",
        "current_decision": "optional_baseline_after_data_staging",
        "entry_condition": "Action labels and baseline protocol exist.",
        "family": "Octo / SmolVLA-style lightweight policies",
        "openness": "open_projects",
        "priority": 7,
        "public_source": "https://github.com/huggingface/lerobot",
        "xperience10m_fit": [
          "Useful after action target design.",
          "Less directly omni-modal than Qwen3-Omni or Cosmos 3."
        ]
      }
    ],
    "source_links": [
      {
        "label": "Qwen3-Omni official HF model",
        "url": "https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct"
      },
      {
        "label": "NVIDIA Cosmos",
        "url": "https://www.nvidia.com/en-us/ai/cosmos/"
      },
      {
        "label": "NVIDIA Isaac GR00T",
        "url": "https://developer.nvidia.com/isaac/gr00t"
      },
      {
        "label": "OpenVLA",
        "url": "https://openvla.github.io/"
      },
      {
        "label": "openpi",
        "url": "https://github.com/Physical-Intelligence/openpi"
      },
      {
        "label": "Gemini Robotics",
        "url": "https://deepmind.google/discover/blog/gemini-robotics-brings-ai-into-the-physical-world/"
      },
      {
        "label": "Octo",
        "url": "https://octo-models.github.io/"
      },
      {
        "label": "LeRobot / SmolVLA",
        "url": "https://github.com/huggingface/lerobot"
      }
    ],
    "status": "planning_artifact"
  },
  "generated_at_utc": "2026-06-03T14:43:22+00:00",
  "omni_plan": {
    "adapter": "LoRA rank 16, alpha 32, dropout 0.05",
    "backbone": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
    "evaluation": [
      "JSON validity",
      "action macro-F1",
      "subtask accuracy",
      "transition accuracy",
      "next-action accuracy",
      "contact accuracy",
      "object micro-F1",
      "held-out episode count"
    ],
    "first_pilot": "32-episode pilot with a held-out episode split, run after valid episodes are staged",
    "training_unit": "episode-level split, window-level supervised examples"
  },
  "phases": [
    {
      "completion_evidence": [
        "PROJECT_STATUS.md",
        "EVALUATION_PROTOCOL.md",
        "RESEARCH_TAKEAWAYS.md",
        "docs/data/summary_metrics.json",
        "results/episode_task_suite/summary_report.json"
      ],
      "deliverables": [
        "1161 aligned windows",
        "12 task contracts",
        "minimal baseline heads",
        "neural MLP heads",
        "modality atlas",
        "task walkthroughs",
        "derived figures"
      ],
      "entry_condition": "One public Xperience-10M sample episode is available.",
      "id": "public_sample_task_lab",
      "name": "Public-Sample Task Lab",
      "reader_takeaway": "The public sample supports task design, feature contracts, walkthroughs, and baseline comparisons.",
      "stage": "now",
      "status": "implemented"
    },
    {
      "completion_evidence": [
        "results/omni_finetune/DATA_ACCESS_STATUS.md",
        "results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md",
        "results/omni_finetune/source_discovery.json"
      ],
      "deliverables": [
        "128 selected episodes",
        "episode manifest",
        "missing-view manifest",
        "held-out episode split",
        "source-discovery report"
      ],
      "entry_condition": "Full-dataset access and enough storage for selected episodes.",
      "id": "multi_episode_data_staging",
      "name": "Multi-Episode Data Staging",
      "reader_takeaway": "The next scale decision is data staging, with train/test separation at the episode level.",
      "stage": "scale_up",
      "status": "active"
    },
    {
      "completion_evidence": [
        "dataset_manifest.json",
        "training_metadata.json",
        "progress.jsonl",
        "metrics.json",
        "predictions.jsonl",
        "RUN_REPORT.md"
      ],
      "deliverables": [
        "dataset JSONL/media manifests",
        "LoRA adapter checkpoint",
        "progress logs",
        "held-out predictions",
        "metrics",
        "confusion matrices",
        "run report"
      ],
      "entry_condition": "Selected episodes are staged locally with no train/test episode leakage.",
      "id": "qwen3_omni_lora_pilot",
      "name": "Qwen3-Omni LoRA Pilot",
      "reader_takeaway": "The first omni-model pilot should establish a complete held-out-episode training and evaluation loop.",
      "stage": "omni",
      "status": "next"
    },
    {
      "completion_evidence": [
        "FOUNDATION_MODEL_PLAN.md",
        "docs/data/foundation_model_plan.json",
        "research_roadmap_interactive.json"
      ],
      "deliverables": [
        "backbone registry",
        "Cosmos 3 world-model branch plan",
        "Qwen3-Omni LoRA baseline plan",
        "OpenVLA/openpi/GR00T policy-branch candidates",
        "model-specific evaluation additions"
      ],
      "entry_condition": "Either the selected relay is staged, or a 3-8 episode dry run is staged for preprocessing checks.",
      "id": "foundation_model_selection_matrix",
      "name": "Foundation-Model Selection Matrix",
      "reader_takeaway": "Qwen3-Omni remains the first trainable held-out pilot; Cosmos 3 is the first world-model branch; VLA/policy models wait for explicit action targets.",
      "stage": "omni",
      "status": "next"
    },
    {
      "completion_evidence": [
        "held-out metrics by session",
        "held-out metrics by task",
        "held-out metrics by modality",
        "ablation tables",
        "qualitative error analysis"
      ],
      "deliverables": [
        "split-by-session metrics",
        "modality ablations",
        "calibration/object/language error analysis",
        "missing-view sensitivity analysis"
      ],
      "entry_condition": "The selected-episode pilot trains and evaluates cleanly.",
      "id": "robustness_run_64_128_episode",
      "name": "64-128 Episode Robustness Run",
      "reader_takeaway": "The robustness run tests whether the pilot conclusions survive broader sessions and missing modalities.",
      "stage": "future",
      "status": "planned"
    },
    {
      "completion_evidence": [
        "task-specific held-out evaluations",
        "qualitative inspection",
        "updated model cards"
      ],
      "deliverables": [
        "Cosmos 3 future-window or action-conditioned world-model probe",
        "OpenVLA/openpi/GR00T action-policy baseline",
        "audio/video/depth/pose/mocap conditioning audit",
        "affordance and object-interaction tasks",
        "synthetic-data usefulness test"
      ],
      "entry_condition": "Enough multi-episode data, compute budget, and model-specific action/world-state targets.",
      "id": "foundation_world_model_extensions",
      "name": "Cosmos 3 and Policy-Model Extensions",
      "reader_takeaway": "The long-term direction is richer multimodal representation learning for embodied-AI reasoning, with model branches chosen by task fit rather than by a single default backbone.",
      "stage": "future",
      "status": "planned"
    }
  ],
  "scale_up": {
    "access_status": "Full-dataset access is granted; selected multi-episode relay is in progress.",
    "candidate_scan_top_level_sessions": 802,
    "estimated_bytes": 298188841943,
    "exclude": [
      "visualization.rrd"
    ],
    "selection_strategy": "stratified_round_robin_by_top_level_session",
    "status": "selected_relay_in_progress",
    "target_episodes": 128,
    "valid_candidates": 12102
  },
  "scope": {
    "feature_blocks": 18,
    "feature_dim": 8546,
    "num_frames": 5821,
    "num_windows": 1161,
    "sample_episode_count": 1,
    "stride_frames": 5,
    "warning": "These walkthroughs explain task contracts on one public sample episode; cross-episode performance requires held-out episodes.",
    "window_frames": 20
  },
  "source_files": [
    "docs/data/research_directions.json",
    "docs/data/task_walkthroughs.json",
    "docs/data/research_roadmap.json",
    "docs/data/foundation_model_plan.json",
    "docs/data/summary_metrics.json",
    "docs/data/research_direction_extensions.json",
    "results/episode_task_suite/summary_report.json",
    "results/episode_task_suite/feature_manifest.json"
  ],
  "tasks": [
    {
      "architecture_family": "multiclass classifier",
      "case_study": "In the coffee-making sample, if the 20-frame window is during a pouring moment, the task asks the model to output an action such as Pour coffee or Pour milk into coffee.",
      "current_limit": "The chronological single-episode split leaves later action classes unseen during training, so the test portion contains classes the model never observed.",
      "direction_roles": {
        "A": "proxy",
        "C": "direct"
      },
      "display_name": "Action Recognition",
      "evidence_links": [
        {
          "href": "data/task_walkthroughs.json",
          "label": "Task walkthrough"
        },
        {
          "href": "single_episode_explorer.html",
          "label": "Single-episode explorer"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/metrics.json",
          "label": "Minimal metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/metrics.json",
          "label": "Neural metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/predictions.csv",
          "label": "Minimal predictions"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/predictions.csv",
          "label": "Neural predictions"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_action/confusion_matrix.csv",
          "label": "Confusion matrix"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_action/confusion_matrix.csv",
          "label": "Neural confusion matrix"
        }
      ],
      "family": "supervised",
      "id": "timeline_action",
      "input": "One 20-frame window represented by the current feature vector: video/audio/depth summaries, pose, SLAM/camera pose, motion capture, IMU, calibration, and language-derived context.",
      "input_short": "20-frame multimodal window",
      "metric": {
        "better_baseline": "minimal",
        "direction": "higher",
        "key": "macro_f1",
        "minimal": 0.05,
        "name": "macro-F1",
        "neural_mlp": 0.0148
      },
      "modalities": [
        "video",
        "depth",
        "pose_slam",
        "motion_capture",
        "inertial",
        "language"
      ],
      "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
      "output_short": "current action class",
      "primary_direction": "C",
      "process_short": "window features -> action label builder -> classifier",
      "research_name": "Egocentric Action Recognition",
      "why": "Reads egocentric sensor state as the current human action; also provides a weak human-motion readout."
    },
    {
      "architecture_family": "multiclass classifier",
      "case_study": "A pouring action may belong to a broader subtask such as preparing or pouring a drink. The model predicts that broader stage instead of a fine action.",
      "current_limit": "Single-episode ordering makes future subtasks hard to generalize.",
      "direction_roles": {
        "C": "direct",
        "D": "proxy"
      },
      "display_name": "Procedure Step Recognition",
      "evidence_links": [
        {
          "href": "data/task_walkthroughs.json",
          "label": "Task walkthrough"
        },
        {
          "href": "single_episode_explorer.html",
          "label": "Single-episode explorer"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/metrics.json",
          "label": "Minimal metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/metrics.json",
          "label": "Neural metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/predictions.csv",
          "label": "Minimal predictions"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/predictions.csv",
          "label": "Neural predictions"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/timeline_subtask/confusion_matrix.csv",
          "label": "Confusion matrix"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/timeline_subtask/confusion_matrix.csv",
          "label": "Neural confusion matrix"
        }
      ],
      "family": "supervised",
      "id": "timeline_subtask",
      "input": "The same all-modality window vector used by action recognition.",
      "input_short": "20-frame multimodal window",
      "metric": {
        "better_baseline": "minimal",
        "direction": "higher",
        "key": "macro_f1",
        "minimal": 0.0506,
        "name": "macro-F1",
        "neural_mlp": 0.0281
      },
      "modalities": [
        "video",
        "depth",
        "pose_slam",
        "motion_capture",
        "inertial",
        "language"
      ],
      "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
      "output_short": "current procedure step",
      "primary_direction": "C",
      "process_short": "window features -> subtask label builder -> classifier",
      "research_name": "Temporal Subtask Recognition",
      "why": "Segments egocentric task state and provides a first proxy for symbolic world/task state."
    },
    {
      "architecture_family": "binary classifier",
      "case_study": "When the demonstrator changes from preparing to pouring, the model should flag a boundary instead of a steady action window.",
      "current_limit": "Boundary class is sparse, so accuracy alone is misleading.",
      "direction_roles": {
        "C": "direct",
        "D": "diagnostic"
      },
      "display_name": "Action Boundary Detection",
      "evidence_links": [
        {
          "href": "data/task_walkthroughs.json",
          "label": "Task walkthrough"
        },
        {
          "href": "single_episode_explorer.html",
          "label": "Single-episode explorer"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/metrics.json",
          "label": "Minimal metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/metrics.json",
          "label": "Neural metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/predictions.csv",
          "label": "Minimal predictions"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/predictions.csv",
          "label": "Neural predictions"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/transition_detection/confusion_matrix.csv",
          "label": "Confusion matrix"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/transition_detection/confusion_matrix.csv",
          "label": "Neural confusion matrix"
        }
      ],
      "family": "diagnostic",
      "id": "transition_detection",
      "input": "One all-modality window vector; the boundary labels derived from action-change timestamps serve as the target.",
      "input_short": "current window with boundary target",
      "metric": {
        "better_baseline": "minimal",
        "direction": "higher",
        "key": "macro_f1",
        "minimal": 0.6118,
        "name": "macro-F1",
        "neural_mlp": 0.5862
      },
      "modalities": [
        "video",
        "pose_slam",
        "motion_capture",
        "inertial",
        "language"
      ],
      "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
      "output_short": "boundary or steady",
      "primary_direction": "C",
      "process_short": "action changes -> boundary labels -> binary classifier",
      "research_name": "Temporal Action Segmentation",
      "why": "Localizes egocentric task boundaries and diagnoses temporal state changes."
    },
    {
      "architecture_family": "future-label classifier",
      "case_study": "If a window shows the person preparing to pour, the target can be the action 20 frames later, such as the start of pouring.",
      "current_limit": "Unseen future labels dominate the single-episode chronological test.",
      "direction_roles": {
        "C": "direct",
        "D": "proxy"
      },
      "display_name": "Next-Action Prediction",
      "evidence_links": [
        {
          "href": "data/task_walkthroughs.json",
          "label": "Task walkthrough"
        },
        {
          "href": "single_episode_explorer.html",
          "label": "Single-episode explorer"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/metrics.json",
          "label": "Minimal metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/metrics.json",
          "label": "Neural metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/predictions.csv",
          "label": "Minimal predictions"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/predictions.csv",
          "label": "Neural predictions"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/next_action/confusion_matrix.csv",
          "label": "Confusion matrix"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/next_action/confusion_matrix.csv",
          "label": "Neural confusion matrix"
        }
      ],
      "family": "supervised",
      "id": "next_action",
      "input": "The current all-modality window vector at time t.",
      "input_short": "current window at time t",
      "metric": {
        "better_baseline": "minimal",
        "direction": "higher",
        "key": "macro_f1",
        "minimal": 0.0593,
        "name": "macro-F1",
        "neural_mlp": 0.0419
      },
      "modalities": [
        "video",
        "depth",
        "pose_slam",
        "motion_capture",
        "inertial"
      ],
      "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
      "output_short": "action at t+20 frames",
      "primary_direction": "C",
      "process_short": "current features -> future label shift -> classifier",
      "research_name": "Short-Horizon Intention Prediction",
      "why": "Tests action intention/task-flow prediction from egocentric context."
    },
    {
      "architecture_family": "continuous regressor",
      "case_study": "When the hand is moving toward a cup or bottle, the model predicts the future 3D hand-joint path.",
      "current_limit": "Forecasting is window-level and not yet a full sequence or policy model.",
      "direction_roles": {
        "A": "direct",
        "C": "proxy"
      },
      "display_name": "Hand Trajectory Forecasting",
      "evidence_links": [
        {
          "href": "data/task_walkthroughs.json",
          "label": "Task walkthrough"
        },
        {
          "href": "single_episode_explorer.html",
          "label": "Single-episode explorer"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/hand_trajectory_forecast/metrics.json",
          "label": "Minimal metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/hand_trajectory_forecast/metrics.json",
          "label": "Neural metrics"
        }
      ],
      "family": "forecast",
      "id": "hand_trajectory_forecast",
      "input": "The current all-modality window vector at time t.",
      "input_short": "current multimodal window",
      "metric": {
        "better_baseline": "neural_mlp",
        "direction": "lower",
        "key": "mpjpe",
        "minimal": 0.8647,
        "name": "MPJPE",
        "neural_mlp": 0.1079
      },
      "modalities": [
        "motion_capture",
        "video",
        "depth",
        "pose_slam",
        "inertial"
      ],
      "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
      "output_short": "future hand-joint trajectory",
      "primary_direction": "A",
      "process_short": "current features -> future mocap target -> regression head",
      "research_name": "3D Hand Motion Forecasting",
      "why": "Directly predicts human hand motion and supports hand-object interaction modeling."
    },
    {
      "architecture_family": "binary classifier",
      "case_study": "During manipulation, the hand may touch a cup, table, or bottle. The task asks whether any contact is happening.",
      "current_limit": "The public sample is degenerate for this target because one class dominates, so the perfect baseline scores should not be read as evidence of contact understanding.",
      "direction_roles": {
        "A": "direct",
        "C": "proxy"
      },
      "display_name": "Contact State Prediction",
      "evidence_links": [
        {
          "href": "data/task_walkthroughs.json",
          "label": "Task walkthrough"
        },
        {
          "href": "single_episode_explorer.html",
          "label": "Single-episode explorer"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/metrics.json",
          "label": "Minimal metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/metrics.json",
          "label": "Neural metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/predictions.csv",
          "label": "Minimal predictions"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/predictions.csv",
          "label": "Neural predictions"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/contact_prediction/confusion_matrix.csv",
          "label": "Confusion matrix"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/contact_prediction/confusion_matrix.csv",
          "label": "Neural confusion matrix"
        }
      ],
      "family": "supervised",
      "id": "contact_prediction",
      "input": "Non-contact and non-caption feature blocks, so the answer is not directly leaked from the target labels.",
      "input_short": "non-contact, non-caption features",
      "metric": {
        "better_baseline": "tie",
        "direction": "higher",
        "key": "macro_f1",
        "minimal": 1.0,
        "name": "macro-F1",
        "neural_mlp": 1.0
      },
      "modalities": [
        "motion_capture",
        "video",
        "depth",
        "inertial"
      ],
      "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
      "output_short": "contact or no contact",
      "primary_direction": "A",
      "process_short": "feature filter -> contact target -> binary classifier",
      "research_name": "Human-Object Contact Prediction",
      "why": "Targets physical interaction state, a core affordance and manipulation signal."
    },
    {
      "architecture_family": "multi-label classifier",
      "case_study": "If the person is pouring milk into coffee, relevant objects may include milk, cup, coffee, or container-like items.",
      "current_limit": "Object labels are language-derived and sparse in one episode.",
      "direction_roles": {
        "A": "proxy",
        "C": "direct",
        "D": "proxy"
      },
      "display_name": "Object Relevance Prediction",
      "evidence_links": [
        {
          "href": "data/task_walkthroughs.json",
          "label": "Task walkthrough"
        },
        {
          "href": "single_episode_explorer.html",
          "label": "Single-episode explorer"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/metrics.json",
          "label": "Minimal metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/metrics.json",
          "label": "Neural metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/object_relevance/predictions.csv",
          "label": "Minimal predictions"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/object_relevance/predictions.csv",
          "label": "Neural predictions"
        }
      ],
      "family": "supervised",
      "id": "object_relevance",
      "input": "Non-caption feature blocks, so the model must infer objects from sensors rather than copying the caption words.",
      "input_short": "non-caption multimodal features",
      "metric": {
        "better_baseline": "minimal",
        "direction": "higher",
        "key": "micro_f1",
        "minimal": 0.1803,
        "name": "micro-F1",
        "neural_mlp": 0.1679
      },
      "modalities": [
        "video",
        "depth",
        "pose_slam",
        "motion_capture",
        "inertial"
      ],
      "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
      "output_short": "relevant object set",
      "primary_direction": "C",
      "process_short": "object vocabulary -> multi-hot labels -> sigmoid heads",
      "research_name": "Object-Centric Interaction Recognition",
      "why": "Connects egocentric activity to manipulated objects and early object-centric state."
    },
    {
      "architecture_family": "retrieval ranker",
      "case_study": "A query like \"Pour milk into coffee\" should rank the windows from the actual pouring moment higher than unrelated windows.",
      "current_limit": "Bag-of-objects language features are too weak for rich grounding.",
      "direction_roles": {
        "C": "direct",
        "D": "proxy"
      },
      "display_name": "Language Grounding",
      "evidence_links": [
        {
          "href": "data/task_walkthroughs.json",
          "label": "Task walkthrough"
        },
        {
          "href": "single_episode_explorer.html",
          "label": "Single-episode explorer"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/caption_grounding/metrics.json",
          "label": "Minimal metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/caption_grounding/metrics.json",
          "label": "Neural metrics"
        }
      ],
      "family": "retrieval",
      "id": "caption_grounding",
      "input": "Caption/object/interaction query features and a set of candidate sensor-window features.",
      "input_short": "text-like query and candidate windows",
      "metric": {
        "better_baseline": "neural_mlp",
        "direction": "higher",
        "key": "mrr",
        "minimal": 0.016,
        "name": "MRR",
        "neural_mlp": 0.0168
      },
      "modalities": [
        "language",
        "video",
        "depth",
        "pose_slam"
      ],
      "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
      "output_short": "ranked matching moments",
      "primary_direction": "C",
      "process_short": "query features -> candidate index -> cosine ranker",
      "research_name": "Language-to-Moment Grounding",
      "why": "Grounds language annotation into egocentric sensor time and task state."
    },
    {
      "architecture_family": "two-tower retrieval head",
      "case_study": "Use motion, IMU, and camera-pose signals from a pouring moment to retrieve the matching depth/video representation for that same moment.",
      "current_limit": "Retrieval shows an alignment signal, not geometric reconstruction.",
      "direction_roles": {
        "B": "proxy",
        "C": "diagnostic",
        "D": "proxy"
      },
      "display_name": "Cross-Modal Retrieval",
      "evidence_links": [
        {
          "href": "data/task_walkthroughs.json",
          "label": "Task walkthrough"
        },
        {
          "href": "single_episode_explorer.html",
          "label": "Single-episode explorer"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/cross_modal_retrieval/metrics.json",
          "label": "Minimal metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/cross_modal_retrieval/metrics.json",
          "label": "Neural metrics"
        }
      ],
      "family": "retrieval",
      "id": "cross_modal_retrieval",
      "input": "Query side: motion, IMU, and camera/pose features. Candidate side: depth and video features.",
      "input_short": "motion/IMU/pose query; depth/video candidates",
      "metric": {
        "better_baseline": "minimal",
        "direction": "higher",
        "key": "mrr",
        "minimal": 0.2693,
        "name": "MRR",
        "neural_mlp": 0.13
      },
      "modalities": [
        "motion_capture",
        "inertial",
        "pose_slam",
        "depth",
        "video"
      ],
      "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
      "output_short": "ranked visual windows",
      "primary_direction": "C",
      "process_short": "modality split -> projection -> nearest-neighbor ranker",
      "research_name": "Multimodal Representation Retrieval",
      "why": "Tests whether synchronized modalities identify the same 4D moment, a prerequisite for reconstruction and world modeling."
    },
    {
      "architecture_family": "feature regressor",
      "case_study": "Given motion, IMU, and camera-pose signals while the hand moves, predict the matching depth/video feature vector.",
      "current_limit": "Feature-vector reconstruction is not pixel, depth-map, mesh, NeRF, or Gaussian reconstruction.",
      "direction_roles": {
        "B": "proxy",
        "D": "proxy"
      },
      "display_name": "Cross-Modal Reconstruction",
      "evidence_links": [
        {
          "href": "data/task_walkthroughs.json",
          "label": "Task walkthrough"
        },
        {
          "href": "single_episode_explorer.html",
          "label": "Single-episode explorer"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/modality_reconstruction/metrics.json",
          "label": "Minimal metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/modality_reconstruction/metrics.json",
          "label": "Neural metrics"
        }
      ],
      "family": "forecast",
      "id": "modality_reconstruction",
      "input": "Motion, IMU, and camera/pose features as input; depth/video features as the regression target.",
      "input_short": "motion, IMU, and camera/pose features",
      "metric": {
        "better_baseline": "neural_mlp",
        "direction": "higher",
        "key": "r2",
        "minimal": -0.0153,
        "name": "R2",
        "neural_mlp": -0.0102
      },
      "modalities": [
        "motion_capture",
        "inertial",
        "pose_slam",
        "depth",
        "video"
      ],
      "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
      "output_short": "reconstructed depth/video vector",
      "primary_direction": "B",
      "process_short": "source-target split -> scaler -> regression head",
      "research_name": "Modality Feature Reconstruction",
      "why": "Predicts visual/depth state from non-target sensors as a weak reconstruction/world-model objective."
    },
    {
      "architecture_family": "pairwise classifier",
      "case_study": "If window A shows reaching and window B shows pouring, the model should distinguish A then B from B then A.",
      "current_limit": "Only local adjacent ordering, not long-horizon causal modeling.",
      "direction_roles": {
        "C": "diagnostic",
        "D": "diagnostic"
      },
      "display_name": "Temporal Order Verification",
      "evidence_links": [
        {
          "href": "data/task_walkthroughs.json",
          "label": "Task walkthrough"
        },
        {
          "href": "single_episode_explorer.html",
          "label": "Single-episode explorer"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/metrics.json",
          "label": "Minimal metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/metrics.json",
          "label": "Neural metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/temporal_order/predictions.csv",
          "label": "Minimal predictions"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/temporal_order/predictions.csv",
          "label": "Neural predictions"
        }
      ],
      "family": "diagnostic",
      "id": "temporal_order",
      "input": "A pair of adjacent window vectors, plus their difference vector.",
      "input_short": "two adjacent windows plus difference vector",
      "metric": {
        "better_baseline": "neural_mlp",
        "direction": "higher",
        "key": "f1",
        "minimal": 0.54,
        "name": "F1",
        "neural_mlp": 0.852
      },
      "modalities": [
        "video",
        "pose_slam",
        "motion_capture",
        "inertial"
      ],
      "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
      "output_short": "correct or reversed",
      "primary_direction": "C",
      "process_short": "pair builder -> feature combiner -> binary classifier",
      "research_name": "Temporal Order Verification",
      "why": "Checks whether features encode local time direction and task progression."
    },
    {
      "architecture_family": "pairwise classifier",
      "case_study": "Motion from a pouring moment is paired with video/depth from several windows later. The task asks the model to detect that mismatch.",
      "current_limit": "Synthetic shifts diagnose alignment but do not solve calibration or mapping.",
      "direction_roles": {
        "B": "diagnostic",
        "C": "diagnostic",
        "D": "diagnostic"
      },
      "display_name": "Multimodal Synchronization Detection",
      "evidence_links": [
        {
          "href": "data/task_walkthroughs.json",
          "label": "Task walkthrough"
        },
        {
          "href": "single_episode_explorer.html",
          "label": "Single-episode explorer"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/metrics.json",
          "label": "Minimal metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/metrics.json",
          "label": "Neural metrics"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/misalignment_detection/predictions.csv",
          "label": "Minimal predictions"
        },
        {
          "href": "https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/results/episode_task_suite/neural_mlp/misalignment_detection/predictions.csv",
          "label": "Neural predictions"
        }
      ],
      "family": "diagnostic",
      "id": "misalignment_detection",
      "input": "A motion-side feature group and a visual/depth-side feature group, either aligned or artificially shifted.",
      "input_short": "motion-side and visual/depth-side feature groups",
      "metric": {
        "better_baseline": "neural_mlp",
        "direction": "higher",
        "key": "f1",
        "minimal": 0.5052,
        "name": "F1",
        "neural_mlp": 0.7153
      },
      "modalities": [
        "motion_capture",
        "inertial",
        "video",
        "depth",
        "pose_slam"
      ],
      "module_summary": "input window -> feature/target builder -> baseline head -> evaluator -> artifact files",
      "output_short": "aligned or shifted",
      "primary_direction": "C",
      "process_short": "aligned/shifted pairs -> feature combiner -> binary classifier",
      "research_name": "Cross-Modal Misalignment Detection",
      "why": "Detects temporal desynchronization, a key data-quality gate for multimodal reconstruction and world models."
    }
  ],
  "title": "Interactive Research Roadmap"
}
