{
  "omni_relay": {
    "status": "pending_huggingface_gated_access",
    "dataset": "ropedia-ai/xperience-10m",
    "staging": "prepared_generic_host_to_host_transfer",
    "training_target": "external_multi_gpu_training_host",
    "selection_strategy": "stratified_round_robin_by_top_level_session",
    "target_episodes": 32,
    "selected_sessions": 32,
    "candidate_scan_top_level_sessions": 64,
    "valid_candidates": 680,
    "estimated_bytes": 72031620552,
    "exclude": [
      "visualization.rrd"
    ],
    "blocker": "Hugging Face returns 403 pending review for the full Xperience-10M gated dataset.",
    "claim_boundary": "No real 32-episode fine-tune is claimed until gated data is available locally and held-out evaluation runs."
  },
  "models": {
    "motion_action": {
      "accuracy": 0.9828178694158075,
      "balanced_accuracy": 0.9643518518518519,
      "macro_f1": 0.96884342657456,
      "weighted_f1": 0.9824311468352843,
      "num_eval_windows": 291,
      "num_classes": 18,
      "majority_baseline_accuracy": 0.13745704467353953,
      "train_final_accuracy": 1.0,
      "train_final_loss": 0.019042566418647766
    },
    "motion_subtask": {
      "accuracy": 0.9758620689655172,
      "balanced_accuracy": 0.9783924095954172,
      "macro_f1": 0.9528048001232955,
      "weighted_f1": 0.9778836359351952,
      "num_eval_windows": 290,
      "num_classes": 14,
      "majority_baseline_accuracy": 0.14482758620689656,
      "train_final_accuracy": 1.0,
      "train_final_loss": 0.02664567530155182
    },
    "all_modalities_action": {
      "accuracy": 0.9828178694158075,
      "balanced_accuracy": 0.9800925925925925,
      "macro_f1": 0.9791023658779895,
      "weighted_f1": 0.98276563540562,
      "num_eval_windows": 291,
      "num_classes": 18,
      "majority_baseline_accuracy": 0.13745704467353953,
      "train_final_accuracy": 1.0,
      "train_final_loss": 0.014624637551605701,
      "feature_dim": 8378,
      "num_windows": 1144
    },
    "all_modalities_subtask": {
      "accuracy": 0.9827586206896551,
      "balanced_accuracy": 0.9505102040816327,
      "macro_f1": 0.9307645963773675,
      "weighted_f1": 0.9837987833808578,
      "num_eval_windows": 290,
      "num_classes": 14,
      "majority_baseline_accuracy": 0.14482758620689656,
      "train_final_accuracy": 1.0,
      "train_final_loss": 0.012823422439396381,
      "feature_dim": 8378,
      "num_windows": 1147
    }
  },
  "suite": {
    "annotation": "data/sample/xperience-10m-sample/annotation.hdf5",
    "num_frames": 5821,
    "num_windows": 1161,
    "feature_dim": 8378,
    "window_frames": 20,
    "stride_frames": 5,
    "tasks": {
      "timeline_action": {
        "accuracy": 0.029154518950437316,
        "balanced_accuracy": 0.03125,
        "macro_f1": 0.05,
        "weighted_f1": 0.04664723032069971,
        "num_eval_windows": 343,
        "num_classes": 18,
        "task": "timeline_action",
        "input": "all modalities -> current action label",
        "split": "chronological",
        "num_windows": 1144,
        "num_train_windows": 801,
        "num_test_windows": 343,
        "feature_dim": 8378,
        "majority_baseline_accuracy": 0.0,
        "train_final_accuracy": 1.0,
        "train_final_loss": 0.01664665900170803,
        "unseen_test_classes": [
          "Place item on table",
          "Pour coffee",
          "Pour milk into coffee",
          "Wait/Prepare for pouring"
        ]
      },
      "timeline_subtask": {
        "accuracy": 0.05813953488372093,
        "balanced_accuracy": 0.05376979652090881,
        "macro_f1": 0.04954121121178666,
        "weighted_f1": 0.06731304264454903,
        "num_eval_windows": 344,
        "num_classes": 14,
        "task": "timeline_subtask",
        "input": "all modalities -> current subtask label",
        "split": "chronological",
        "num_windows": 1147,
        "num_train_windows": 803,
        "num_test_windows": 344,
        "feature_dim": 8378,
        "majority_baseline_accuracy": 0.0,
        "train_final_accuracy": 1.0,
        "train_final_loss": 0.014040183275938034,
        "unseen_test_classes": [
          "Move bottle to coffee equipment",
          "Pour coffee",
          "Pour milk into coffee",
          "Prepare for pouring"
        ]
      },
      "transition_detection": {
        "accuracy": 0.9252873563218391,
        "balanced_accuracy": 0.6931475903614458,
        "macro_f1": 0.6551829268292684,
        "weighted_f1": 0.9323030557891787,
        "num_eval_windows": 348,
        "num_classes": 2,
        "task": "transition_detection",
        "input": "all modalities -> action boundary/steady",
        "split": "chronological",
        "num_windows": 1161,
        "num_train_windows": 813,
        "num_test_windows": 348,
        "feature_dim": 8378,
        "majority_baseline_accuracy": 0.9540229885057471,
        "train_final_accuracy": 1.0,
        "train_final_loss": 0.007071746978908777,
        "unseen_test_classes": [],
        "boundary_precision": 0.125,
        "boundary_recall": 0.75,
        "boundary_f1": 0.21428571428571427,
        "matched_boundaries": 3,
        "true_boundaries": 4,
        "predicted_boundaries": 24,
        "mean_abs_timing_error_frames": 2.6666666666666665
      },
      "next_action": {
        "accuracy": 0.034482758620689655,
        "balanced_accuracy": 0.04,
        "macro_f1": 0.05925925925925927,
        "weighted_f1": 0.05108556832694764,
        "num_eval_windows": 348,
        "num_classes": 18,
        "task": "next_action",
        "input": "all modalities at t -> action at t+20 frames",
        "split": "chronological",
        "num_windows": 1161,
        "num_train_windows": 813,
        "num_test_windows": 348,
        "feature_dim": 8378,
        "majority_baseline_accuracy": 0.0,
        "train_final_accuracy": 1.0,
        "train_final_loss": 0.017629079520702362,
        "unseen_test_classes": [
          "Place item on table",
          "Pour coffee",
          "Pour milk into coffee",
          "Wait/Prepare for pouring"
        ]
      },
      "hand_trajectory_forecast": {
        "mse": 11.323140144348145,
        "mae": 0.40246668457984924,
        "r2": -1334.788993815828,
        "task": "hand_trajectory_forecast",
        "input": "all modalities at t -> future left/right hand 3D joints",
        "split": "chronological",
        "num_windows": 1159,
        "num_train_windows": 811,
        "num_test_windows": 348,
        "forecast_frames": 10,
        "mpjpe": 0.8222644925117493,
        "final_frame_mpjpe": 1.0649521350860596,
        "target_dim": 1260
      },
      "contact_prediction": {
        "accuracy": 1.0,
        "balanced_accuracy": 1.0,
        "macro_f1": 1.0,
        "weighted_f1": 1.0,
        "num_eval_windows": 348,
        "num_classes": 1,
        "task": "contact_prediction",
        "input": "all non-contact/non-caption-label modalities -> any body contact",
        "split": "chronological",
        "num_windows": 1161,
        "num_train_windows": 813,
        "num_test_windows": 348,
        "feature_dim": 7335,
        "majority_baseline_accuracy": 1.0,
        "train_final_accuracy": 1.0,
        "train_final_loss": 0.0005947681493125856,
        "unseen_test_classes": []
      },
      "object_relevance": {
        "micro_f1": 0.18393030009680542,
        "macro_f1": 0.06427052187996415,
        "exact_match": 0.005747126436781609,
        "precision": 0.16360505166475317,
        "recall": 0.21002210759027265,
        "task": "object_relevance",
        "input": "all non-caption modalities -> current relevant object set",
        "split": "chronological",
        "num_windows": 1161,
        "num_train_windows": 813,
        "num_test_windows": 348,
        "num_objects": 34
      },
      "caption_grounding": {
        "mrr": 0.017183946083791223,
        "median_rank": 167.0,
        "mean_rank": 174.39367816091954,
        "num_queries": 348,
        "top1_accuracy": 0.0028735632183908046,
        "top5_accuracy": 0.011494252873563218,
        "top10_accuracy": 0.017241379310344827,
        "task": "caption_grounding",
        "input": "caption objects/interaction text query + candidate sensor windows",
        "output": "matching time window",
        "split": "chronological",
        "num_train_windows": 813,
        "num_test_windows": 348
      },
      "cross_modal_retrieval": {
        "mrr": 0.26335984006618296,
        "median_rank": 12.5,
        "mean_rank": 43.33045977011494,
        "num_queries": 348,
        "top1_accuracy": 0.14942528735632185,
        "top5_accuracy": 0.3764367816091954,
        "top10_accuracy": 0.47413793103448276,
        "task": "cross_modal_retrieval",
        "input": "motion/IMU/camera query",
        "output": "matching depth/video window",
        "split": "chronological",
        "num_train_windows": 813,
        "num_test_windows": 348
      },
      "modality_reconstruction": {
        "mse": 1359.1639404296875,
        "mae": 0.31084805727005005,
        "r2": -0.016022846771134747,
        "task": "modality_reconstruction",
        "input": "motion/IMU/camera",
        "output": "depth/video feature vector",
        "split": "chronological",
        "num_train_windows": 813,
        "num_test_windows": 348,
        "target_dim": 5096
      },
      "temporal_order": {
        "accuracy": 0.46120689655172414,
        "precision": 0.4720496894409938,
        "recall": 0.6551724137931034,
        "f1": 0.5487364620938628,
        "tp": 228,
        "tn": 93,
        "fp": 255,
        "fn": 120,
        "positive_rate_true": 0.5,
        "positive_rate_pred": 0.6939655172413793,
        "task": "temporal_order",
        "input": "two adjacent windows -> whether order is correct",
        "split": "chronological",
        "num_samples": 2320,
        "num_train_samples": 1624,
        "num_test_samples": 696,
        "train_final_accuracy": 0.5104679802955665
      },
      "misalignment_detection": {
        "accuracy": 0.5028901734104047,
        "precision": 0.5030864197530864,
        "recall": 0.47109826589595377,
        "f1": 0.4865671641791045,
        "tp": 163,
        "tn": 185,
        "fp": 161,
        "fn": 183,
        "positive_rate_true": 0.5,
        "positive_rate_pred": 0.4682080924855491,
        "task": "misalignment_detection",
        "input": "motion+visual pair -> aligned vs shifted by 8 windows",
        "split": "chronological",
        "num_samples": 2306,
        "num_train_samples": 1614,
        "num_test_samples": 692,
        "train_final_accuracy": 0.5018587360594795
      }
    },
    "neural_model": {
      "name": "neural_mlp",
      "type": "lightweight PyTorch MLP over shared window features",
      "epochs": 80,
      "hidden_dim": 128,
      "batch_size": 128,
      "learning_rate": 0.001,
      "weight_decay": 0.0001,
      "dropout": 0.1,
      "device": "auto"
    },
    "neural_tasks": {
      "timeline_action": {
        "accuracy": 0.014577259475218658,
        "balanced_accuracy": 0.015625,
        "macro_f1": 0.02631578947368421,
        "weighted_f1": 0.024551173852999847,
        "num_eval_windows": 343,
        "num_classes": 18,
        "task": "timeline_action",
        "input": "all modalities -> current action label",
        "split": "chronological",
        "num_windows": 1144,
        "num_train_windows": 801,
        "num_test_windows": 343,
        "feature_dim": 8378,
        "majority_baseline_accuracy": 0.0,
        "unseen_test_classes": [
          "Place item on table",
          "Pour coffee",
          "Pour milk into coffee",
          "Wait/Prepare for pouring"
        ],
        "model": "neural_mlp",
        "head": "z-score -> MLP softmax",
        "neural_epochs": 80,
        "neural_hidden_dim": 128,
        "neural_batch_size": 128,
        "neural_learning_rate": 0.001,
        "neural_weight_decay": 0.0001,
        "neural_dropout": 0.1,
        "neural_device": "cpu",
        "train_final_loss": 0.0001524056650931597,
        "train_final_accuracy": 1.0
      },
      "timeline_subtask": {
        "accuracy": 0.01744186046511628,
        "balanced_accuracy": 0.021052631578947368,
        "macro_f1": 0.017518248175182476,
        "weighted_f1": 0.014513664912578507,
        "num_eval_windows": 344,
        "num_classes": 14,
        "task": "timeline_subtask",
        "input": "all modalities -> current subtask label",
        "split": "chronological",
        "num_windows": 1147,
        "num_train_windows": 803,
        "num_test_windows": 344,
        "feature_dim": 8378,
        "majority_baseline_accuracy": 0.0,
        "unseen_test_classes": [
          "Move bottle to coffee equipment",
          "Pour coffee",
          "Pour milk into coffee",
          "Prepare for pouring"
        ],
        "model": "neural_mlp",
        "head": "z-score -> MLP softmax",
        "neural_epochs": 80,
        "neural_hidden_dim": 128,
        "neural_batch_size": 128,
        "neural_learning_rate": 0.001,
        "neural_weight_decay": 0.0001,
        "neural_dropout": 0.1,
        "neural_device": "cpu",
        "train_final_loss": 0.06660133146519678,
        "train_final_accuracy": 0.9912826899128269
      },
      "transition_detection": {
        "accuracy": 0.9310344827586207,
        "balanced_accuracy": 0.6664156626506024,
        "macro_f1": 0.6484848484848484,
        "weighted_f1": 0.9346569139672588,
        "num_eval_windows": 348,
        "num_classes": 2,
        "task": "transition_detection",
        "input": "all modalities -> action boundary/steady",
        "split": "chronological",
        "num_windows": 1161,
        "num_train_windows": 813,
        "num_test_windows": 348,
        "feature_dim": 8378,
        "majority_baseline_accuracy": 0.9540229885057471,
        "unseen_test_classes": [],
        "model": "neural_mlp",
        "head": "z-score -> MLP softmax",
        "neural_epochs": 80,
        "neural_hidden_dim": 128,
        "neural_batch_size": 128,
        "neural_learning_rate": 0.001,
        "neural_weight_decay": 0.0001,
        "neural_dropout": 0.1,
        "neural_device": "cpu",
        "train_final_loss": 0.005629667796936003,
        "train_final_accuracy": 0.998769987699877,
        "boundary_precision": 0.1,
        "boundary_recall": 0.5,
        "boundary_f1": 0.16666666666666669,
        "matched_boundaries": 2,
        "true_boundaries": 4,
        "predicted_boundaries": 20,
        "mean_abs_timing_error_frames": 5.0
      },
      "next_action": {
        "accuracy": 0.011494252873563218,
        "balanced_accuracy": 0.013333333333333332,
        "macro_f1": 0.023529411764705882,
        "weighted_f1": 0.02028397565922921,
        "num_eval_windows": 348,
        "num_classes": 18,
        "task": "next_action",
        "input": "all modalities at t -> action at t+20 frames",
        "split": "chronological",
        "num_windows": 1161,
        "num_train_windows": 813,
        "num_test_windows": 348,
        "feature_dim": 8378,
        "majority_baseline_accuracy": 0.0,
        "unseen_test_classes": [
          "Place item on table",
          "Pour coffee",
          "Pour milk into coffee",
          "Wait/Prepare for pouring"
        ],
        "model": "neural_mlp",
        "head": "z-score -> MLP softmax",
        "neural_epochs": 80,
        "neural_hidden_dim": 128,
        "neural_batch_size": 128,
        "neural_learning_rate": 0.001,
        "neural_weight_decay": 0.0001,
        "neural_dropout": 0.1,
        "neural_device": "cpu",
        "train_final_loss": 0.0050763053797378156,
        "train_final_accuracy": 0.998769987699877
      },
      "hand_trajectory_forecast": {
        "mse": 0.005083972588181496,
        "mae": 0.055900074541568756,
        "r2": 0.40024460814419005,
        "task": "hand_trajectory_forecast",
        "input": "all modalities at t -> future left/right hand 3D joints",
        "split": "chronological",
        "num_windows": 1159,
        "num_train_windows": 811,
        "num_test_windows": 348,
        "forecast_frames": 10,
        "mpjpe": 0.11163123697042465,
        "final_frame_mpjpe": 0.11860372871160507,
        "target_dim": 1260,
        "model": "neural_mlp",
        "head": "z-score -> MLP regression",
        "neural_epochs": 80,
        "neural_hidden_dim": 128,
        "neural_batch_size": 128,
        "neural_learning_rate": 0.001,
        "neural_weight_decay": 0.0001,
        "neural_dropout": 0.1,
        "neural_device": "cpu",
        "train_final_loss": 0.059780220692901516
      },
      "contact_prediction": {
        "accuracy": 1.0,
        "balanced_accuracy": 1.0,
        "macro_f1": 1.0,
        "weighted_f1": 1.0,
        "num_eval_windows": 348,
        "num_classes": 1,
        "task": "contact_prediction",
        "input": "all non-contact/non-caption-label modalities -> any body contact",
        "split": "chronological",
        "num_windows": 1161,
        "num_train_windows": 813,
        "num_test_windows": 348,
        "feature_dim": 7335,
        "majority_baseline_accuracy": 1.0,
        "unseen_test_classes": [],
        "model": "neural_mlp",
        "head": "z-score -> MLP softmax",
        "neural_epochs": 80,
        "neural_hidden_dim": 128,
        "neural_batch_size": 128,
        "neural_learning_rate": 0.001,
        "neural_weight_decay": 0.0001,
        "neural_dropout": 0.1,
        "neural_device": "cpu",
        "train_final_loss": 0.0,
        "train_final_accuracy": 1.0
      },
      "object_relevance": {
        "micro_f1": 0.1797583081570997,
        "macro_f1": 0.04958769134098823,
        "exact_match": 0.011494252873563218,
        "precision": 0.18435321456235476,
        "recall": 0.17538688282977155,
        "task": "object_relevance",
        "input": "all non-caption modalities -> current relevant object set",
        "split": "chronological",
        "num_windows": 1161,
        "num_train_windows": 813,
        "num_test_windows": 348,
        "num_objects": 34,
        "feature_dim": 7482,
        "model": "neural_mlp",
        "head": "z-score -> MLP sigmoid multilabel",
        "neural_epochs": 80,
        "neural_hidden_dim": 128,
        "neural_batch_size": 128,
        "neural_learning_rate": 0.001,
        "neural_weight_decay": 0.0001,
        "neural_dropout": 0.1,
        "neural_device": "cpu",
        "train_final_loss": 0.0006806955548115865
      },
      "caption_grounding": {
        "mrr": 0.01781111161035397,
        "median_rank": 184.0,
        "mean_rank": 183.86206896551724,
        "num_queries": 348,
        "top1_accuracy": 0.005747126436781609,
        "top5_accuracy": 0.017241379310344827,
        "top10_accuracy": 0.02586206896551724,
        "task": "caption_grounding",
        "input": "caption objects/interaction text query + candidate sensor windows",
        "split": "chronological",
        "num_train_windows": 813,
        "num_test_windows": 348,
        "target_dim": 896,
        "output": "matching time window",
        "model": "neural_mlp",
        "head": "z-score -> MLP projection/regression",
        "neural_epochs": 80,
        "neural_hidden_dim": 128,
        "neural_batch_size": 128,
        "neural_learning_rate": 0.001,
        "neural_weight_decay": 0.0001,
        "neural_dropout": 0.1,
        "neural_device": "cpu",
        "train_final_loss": 0.06571704036525254
      },
      "cross_modal_retrieval": {
        "mrr": 0.1530070022204131,
        "median_rank": 34.0,
        "mean_rank": 62.043103448275865,
        "num_queries": 348,
        "top1_accuracy": 0.07183908045977011,
        "top5_accuracy": 0.21551724137931033,
        "top10_accuracy": 0.3017241379310345,
        "task": "cross_modal_retrieval",
        "input": "motion/IMU/camera query",
        "split": "chronological",
        "num_train_windows": 813,
        "num_test_windows": 348,
        "target_dim": 5096,
        "output": "matching depth/video window",
        "model": "neural_mlp",
        "head": "z-score -> MLP projection/regression",
        "neural_epochs": 80,
        "neural_hidden_dim": 128,
        "neural_batch_size": 128,
        "neural_learning_rate": 0.001,
        "neural_weight_decay": 0.0001,
        "neural_dropout": 0.1,
        "neural_device": "cpu",
        "train_final_loss": 0.2246821296537641
      },
      "modality_reconstruction": {
        "mse": 1351.3720703125,
        "mae": 0.10358995944261551,
        "r2": -0.010198171891414143,
        "task": "modality_reconstruction",
        "input": "motion/IMU/camera",
        "split": "chronological",
        "num_train_windows": 813,
        "num_test_windows": 348,
        "target_dim": 5096,
        "output": "depth/video feature vector",
        "model": "neural_mlp",
        "head": "z-score -> MLP projection/regression",
        "neural_epochs": 80,
        "neural_hidden_dim": 128,
        "neural_batch_size": 128,
        "neural_learning_rate": 0.001,
        "neural_weight_decay": 0.0001,
        "neural_dropout": 0.1,
        "neural_device": "cpu",
        "train_final_loss": 0.2246821296537641
      },
      "temporal_order": {
        "accuracy": 0.8706896551724138,
        "precision": 0.864406779661017,
        "recall": 0.8793103448275862,
        "f1": 0.8717948717948718,
        "tp": 306,
        "tn": 300,
        "fp": 48,
        "fn": 42,
        "positive_rate_true": 0.5,
        "positive_rate_pred": 0.5086206896551724,
        "task": "temporal_order",
        "input": "two adjacent windows -> whether order is correct",
        "split": "chronological",
        "num_samples": 2320,
        "num_train_samples": 1624,
        "num_test_samples": 696,
        "feature_dim": 25134,
        "model": "neural_mlp",
        "head": "z-score -> MLP binary softmax",
        "neural_epochs": 80,
        "neural_hidden_dim": 128,
        "neural_batch_size": 128,
        "neural_learning_rate": 0.001,
        "neural_weight_decay": 0.0001,
        "neural_dropout": 0.1,
        "neural_device": "cpu",
        "train_final_loss": 8.5640803086261e-05,
        "train_final_accuracy": 1.0
      },
      "misalignment_detection": {
        "accuracy": 0.7312138728323699,
        "precision": 0.7272727272727273,
        "recall": 0.7398843930635838,
        "f1": 0.7335243553008597,
        "tp": 256,
        "tn": 250,
        "fp": 96,
        "fn": 90,
        "positive_rate_true": 0.5,
        "positive_rate_pred": 0.5086705202312138,
        "task": "misalignment_detection",
        "input": "motion+visual pair -> aligned vs shifted by 8 windows",
        "split": "chronological",
        "num_samples": 2306,
        "num_train_samples": 1614,
        "num_test_samples": 692,
        "feature_dim": 7343,
        "model": "neural_mlp",
        "head": "z-score -> MLP binary softmax",
        "neural_epochs": 80,
        "neural_hidden_dim": 128,
        "neural_batch_size": 128,
        "neural_learning_rate": 0.001,
        "neural_weight_decay": 0.0001,
        "neural_dropout": 0.1,
        "neural_device": "cpu",
        "train_final_loss": 0.01810159092443583,
        "train_final_accuracy": 0.993184634448575
      }
    }
  },
  "feature_manifest": [
    {
      "name": "hand_left_joints",
      "start": 0,
      "end": 441,
      "dim": 441
    },
    {
      "name": "hand_right_joints",
      "start": 441,
      "end": 882,
      "dim": 441
    },
    {
      "name": "body_joints",
      "start": 882,
      "end": 1974,
      "dim": 1092
    },
    {
      "name": "body_contacts",
      "start": 1974,
      "end": 2121,
      "dim": 147
    },
    {
      "name": "camera_translation",
      "start": 2121,
      "end": 2142,
      "dim": 21
    },
    {
      "name": "camera_rotation_matrix",
      "start": 2142,
      "end": 2205,
      "dim": 63
    },
    {
      "name": "imu_accel_gyro",
      "start": 2205,
      "end": 2247,
      "dim": 42
    },
    {
      "name": "depth_confidence",
      "start": 2247,
      "end": 3227,
      "dim": 980
    },
    {
      "name": "video_fisheye_cam0",
      "start": 3227,
      "end": 3913,
      "dim": 686
    },
    {
      "name": "video_fisheye_cam1",
      "start": 3913,
      "end": 4599,
      "dim": 686
    },
    {
      "name": "video_fisheye_cam2",
      "start": 4599,
      "end": 5285,
      "dim": 686
    },
    {
      "name": "video_fisheye_cam3",
      "start": 5285,
      "end": 5971,
      "dim": 686
    },
    {
      "name": "video_stereo_left",
      "start": 5971,
      "end": 6657,
      "dim": 686
    },
    {
      "name": "video_stereo_right",
      "start": 6657,
      "end": 7343,
      "dim": 686
    },
    {
      "name": "caption_objects_interaction_text",
      "start": 7343,
      "end": 8239,
      "dim": 896
    },
    {
      "name": "slam_point_cloud",
      "start": 8239,
      "end": 8261,
      "dim": 22
    },
    {
      "name": "calibration",
      "start": 8261,
      "end": 8378,
      "dim": 117
    }
  ]
}