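{
  "description": "Measured audio contribution variants over the single public Xperience-10M sample episode. All *_delta fields are oriented so that a positive value means improvement (candidate minus baseline when higher_is_better is true, baseline minus candidate otherwise); aggregate means are unweighted averages of these deltas across tasks whose primary metrics differ in scale.",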
  "scope": "single public sample episode; chronological split; ridge heads over fixed feature contracts",
  "raw_audio_metadata": {
    "source": "local_public_sample/fisheye_cam0.mp4",
    "exists": true,
    "has_audio": true,
    "sample_rate": 16000,
    "fps": 20.00137419266181,
    "num_samples": 4656994,
    "num_windows": 1161,
    "feature_dim": 588,
    "mel_bands": 64,
    "fft_size": 512,
    "hop_length": 160,
    "feature_description": "Per-window raw waveform STFT log-mel statistics plus delta and waveform envelope statistics."
  },
  "num_tasks": 12,
  "variants": {
    "all_handcrafted_audio": "All Current Features",
    "all_except_audio": "All Except Audio",
    "handcrafted_audio_only": "Audio Only",
    "raw_logmel_audio_only": "Raw Log-Mel Audio Only",
    "replace_handcrafted_with_raw": "Audio Representation Replacement",
    "all_plus_raw_logmel": "All Current Features + Raw Log-Mel"
  },
  "task_summaries": [
    {
      "task": "timeline_action",
      "task_display": "Current Action Recognition",
      "primary_metric": "macro_f1",
      "higher_is_better": true,
      "all_handcrafted_audio": 0.00905456968081885,
      "all_except_audio": 0.008771929824561405,
      "handcrafted_audio_delta": 0.0002826398562574446,
      "raw_logmel_audio_only": 0.0,
      "replace_handcrafted_with_raw": 0.0013495276653171392,
      "raw_replacement_delta_vs_no_audio": -0.007422402159244265,
      "raw_replacement_delta_vs_handcrafted": -0.00770504201550171,
      "all_plus_raw_logmel": 0.002734107997265892,
      "all_plus_raw_delta_vs_handcrafted": -0.006320461683552957
    },
    {
      "task": "timeline_subtask",
      "task_display": "Current Subtask Recognition",
      "primary_metric": "macro_f1",
      "higher_is_better": true,
      "all_handcrafted_audio": 0.011256354393609296,
      "all_except_audio": 0.0111731843575419,
      "handcrafted_audio_delta": 8.317003606739606e-05,
      "raw_logmel_audio_only": 0.0016722408026755855,
      "replace_handcrafted_with_raw": 0.0008257638315441783,
      "raw_replacement_delta_vs_no_audio": -0.01034742052599772,
      "raw_replacement_delta_vs_handcrafted": -0.010430590562065117,
      "all_plus_raw_logmel": 0.0017889087656529517,
      "all_plus_raw_delta_vs_handcrafted": -0.009467445627956345
    },
    {
      "task": "transition_detection",
      "task_display": "Action Transition Detection",
      "primary_metric": "macro_f1",
      "higher_is_better": true,
      "all_handcrafted_audio": 0.46213292117465227,
      "all_except_audio": 0.46870229007633585,
      "handcrafted_audio_delta": -0.006569368901683581,
      "raw_logmel_audio_only": 0.4637904468412942,
      "replace_handcrafted_with_raw": 0.4792100707180375,
      "raw_replacement_delta_vs_no_audio": 0.010507780641701658,
      "raw_replacement_delta_vs_handcrafted": 0.01707714954338524,
      "all_plus_raw_logmel": 0.4816233470132239,
      "all_plus_raw_delta_vs_handcrafted": 0.019490425838571634
    },
    {
      "task": "next_action",
      "task_display": "Next-Action Prediction",
      "primary_metric": "macro_f1",
      "higher_is_better": true,
      "all_handcrafted_audio": 0.01058201058201058,
      "all_except_audio": 0.010709504685408301,
      "handcrafted_audio_delta": -0.0001274941033977215,
      "raw_logmel_audio_only": 0.0017301038062283738,
      "replace_handcrafted_with_raw": 0.006006006006006006,
      "raw_replacement_delta_vs_no_audio": -0.004703498679402295,
      "raw_replacement_delta_vs_handcrafted": -0.004576004576004574,
      "all_plus_raw_logmel": 0.0058479532163742695,
      "all_plus_raw_delta_vs_handcrafted": -0.00473405736563631
    },
    {
      "task": "hand_trajectory_forecast",
      "task_display": "Future Hand Motion Forecasting",
      "primary_metric": "mae",
      "higher_is_better": false,
      "all_handcrafted_audio": 4.466395378112793,
      "all_except_audio": 4.303755283355713,
      "handcrafted_audio_delta": -0.16264009475708008,
      "raw_logmel_audio_only": 3.1172122955322266,
      "replace_handcrafted_with_raw": 4.305870532989502,
      "raw_replacement_delta_vs_no_audio": -0.0021152496337890625,
      "raw_replacement_delta_vs_handcrafted": 0.16052484512329102,
      "all_plus_raw_logmel": 4.1367621421813965,
      "all_plus_raw_delta_vs_handcrafted": 0.3296332359313965
    },
    {
      "task": "contact_prediction",
      "task_display": "Contact State Prediction",
      "primary_metric": "macro_f1",
      "higher_is_better": true,
      "all_handcrafted_audio": 1.0,
      "all_except_audio": 1.0,
      "handcrafted_audio_delta": 0.0,
      "raw_logmel_audio_only": 1.0,
      "replace_handcrafted_with_raw": 1.0,
      "raw_replacement_delta_vs_no_audio": 0.0,
      "raw_replacement_delta_vs_handcrafted": 0.0,
      "all_plus_raw_logmel": 1.0,
      "all_plus_raw_delta_vs_handcrafted": 0.0
    },
    {
      "task": "object_relevance",
      "task_display": "Relevant Object Prediction",
      "primary_metric": "micro_f1",
      "higher_is_better": true,
      "all_handcrafted_audio": 0.15813953488372093,
      "all_except_audio": 0.14793328498912256,
      "handcrafted_audio_delta": 0.010206249894598368,
      "raw_logmel_audio_only": 0.15894868585732164,
      "replace_handcrafted_with_raw": 0.17871759890859482,
      "raw_replacement_delta_vs_no_audio": 0.030784313919472256,
      "raw_replacement_delta_vs_handcrafted": 0.020578064024873888,
      "all_plus_raw_logmel": 0.18262653898768813,
      "all_plus_raw_delta_vs_handcrafted": 0.024487004103967203
    },
    {
      "task": "caption_grounding",
      "task_display": "Language-to-Time Grounding",
      "primary_metric": "mrr",
      "higher_is_better": true,
      "all_handcrafted_audio": 0.03208567947149277,
      "all_except_audio": 0.027228528633713722,
      "handcrafted_audio_delta": 0.004857150837779045,
      "raw_logmel_audio_only": 0.014815197326242924,
      "replace_handcrafted_with_raw": 0.02484782598912716,
      "raw_replacement_delta_vs_no_audio": -0.002380702644586563,
      "raw_replacement_delta_vs_handcrafted": -0.007237853482365608,
      "all_plus_raw_logmel": 0.02719014883041382,
      "all_plus_raw_delta_vs_handcrafted": -0.004895530641078949
    },
    {
      "task": "cross_modal_retrieval",
      "task_display": "Cross-Modal Window Retrieval",
      "primary_metric": "mrr",
      "higher_is_better": true,
      "all_handcrafted_audio": 0.3751238286495209,
      "all_except_audio": 0.38921058177948,
      "handcrafted_audio_delta": -0.014086753129959106,
      "raw_logmel_audio_only": 0.01806792803108692,
      "replace_handcrafted_with_raw": 0.32749155163764954,
      "raw_replacement_delta_vs_no_audio": -0.061719030141830444,
      "raw_replacement_delta_vs_handcrafted": -0.04763227701187134,
      "all_plus_raw_logmel": 0.31795138120651245,
      "all_plus_raw_delta_vs_handcrafted": -0.05717244744300842
    },
    {
      "task": "modality_reconstruction",
      "task_display": "Sensor-to-Visual Reconstruction",
      "primary_metric": "mae",
      "higher_is_better": false,
      "all_handcrafted_audio": 9.79421329498291,
      "all_except_audio": 10.446661949157715,
      "handcrafted_audio_delta": 0.6524486541748047,
      "raw_logmel_audio_only": 2.6225292682647705,
      "replace_handcrafted_with_raw": 8.830678939819336,
      "raw_replacement_delta_vs_no_audio": 1.615983009338379,
      "raw_replacement_delta_vs_handcrafted": 0.9635343551635742,
      "all_plus_raw_logmel": 8.392388343811035,
      "all_plus_raw_delta_vs_handcrafted": 1.401824951171875
    },
    {
      "task": "temporal_order",
      "task_display": "Temporal Order Verification",
      "primary_metric": "macro_f1",
      "higher_is_better": true,
      "all_handcrafted_audio": 0.5172413793103449,
      "all_except_audio": 0.4942528735632184,
      "handcrafted_audio_delta": 0.022988505747126464,
      "raw_logmel_audio_only": 0.5028735632183908,
      "replace_handcrafted_with_raw": 0.5301714439065678,
      "raw_replacement_delta_vs_no_audio": 0.03591857034334939,
      "raw_replacement_delta_vs_handcrafted": 0.012930064596222923,
      "all_plus_raw_logmel": 0.5330450130569861,
      "all_plus_raw_delta_vs_handcrafted": 0.015803633746641288
    },
    {
      "task": "misalignment_detection",
      "task_display": "Cross-Modal Misalignment Detection",
      "primary_metric": "macro_f1",
      "higher_is_better": true,
      "all_handcrafted_audio": 0.41734045375379186,
      "all_except_audio": 0.42258557365378524,
      "handcrafted_audio_delta": -0.005245119899993378,
      "raw_logmel_audio_only": 0.47823544277887897,
      "replace_handcrafted_with_raw": 0.44378951880827355,
      "raw_replacement_delta_vs_no_audio": 0.021203945154488313,
      "raw_replacement_delta_vs_handcrafted": 0.02644906505448169,
      "all_plus_raw_logmel": 0.4373795761078998,
      "all_plus_raw_delta_vs_handcrafted": 0.02003912235410793
    }
  ],
  "aggregate": {
    "mean_handcrafted_audio_delta": 0.041849794979543296,
    "tasks_where_handcrafted_audio_improves": 6,
    "mean_raw_replacement_delta_vs_handcrafted": 0.09362598132150173,
    "tasks_where_raw_replacement_improves_over_handcrafted": 6
  },
  "provenance": {
    "suite_dir": "results/episode_task_suite",
    "shared_windows": "results/episode_task_suite/shared_windows.npz",
    "feature_manifest": "results/episode_task_suite/feature_manifest.json",
    "audio_source": "local_public_sample/fisheye_cam0.mp4",
    "annotation_source": "local_public_sample/annotation.hdf5",
    "homie_toolkit_available": true
  }
}
