{
  "baseline_ranking_note": "Baselines are ranked by raw scores only: coordination ratio descending, then excess cost ascending, then excess VPS ascending.",
  "baseline_raw_rankings": [
    {
      "coordination_ratio_raw_mean": 1.0,
      "excess_cost_raw_mean": 14.644444444444444,
      "excess_vps_raw_mean": 12.4,
      "label": "IMAP",
      "raw_rank": 1,
      "run_id": "baseline-imap-calbench-mixed-001",
      "trace_count": 90
    },
    {
      "coordination_ratio_raw_mean": 0.9955555555555555,
      "excess_cost_raw_mean": 38.36666666666667,
      "excess_vps_raw_mean": 24.301616161616156,
      "label": "DSM-welfare",
      "raw_rank": 2,
      "run_id": "baseline-dsm-welfare-calbench-mixed-001",
      "trace_count": 90
    },
    {
      "coordination_ratio_raw_mean": 0.6266666666666667,
      "excess_cost_raw_mean": 182.82222222222222,
      "excess_vps_raw_mean": 0.10060000000000004,
      "label": "SD-MAP",
      "raw_rank": 3,
      "run_id": "baseline-sd-map-calbench-mixed-001",
      "trace_count": 90
    },
    {
      "coordination_ratio_raw_mean": 0.4733333333333333,
      "excess_cost_raw_mean": 104.04444444444445,
      "excess_vps_raw_mean": 0.0,
      "label": "DSM-private",
      "raw_rank": 4,
      "run_id": "baseline-dsm-private-calbench-mixed-001",
      "trace_count": 90
    }
  ],
  "cache_age_seconds": 0.0,
  "cache_status": "miss",
  "data_source": "dynamodb",
  "dynamodb_table": "calbench-openskill-ratings",
  "generated_at": "2026-06-21T07:53:29Z",
  "job_queue": {
    "active": [],
    "all_counts": {
      "done": 75,
      "failed": 3,
      "pending": 0,
      "running": 0
    },
    "counts": {
      "done": 54,
      "failed": 0,
      "pending": 0,
      "running": 0
    },
    "recent": [
      {
        "calendar_revision": "3d3eed1f7d575f5215f29dd0ebb2151c9ef454a5",
        "created_at": "2026-05-23T04:00:04+00:00",
        "experiments": [
          "experiments/generated/live-matchmaking-20260523T040002Z.yaml"
        ],
        "finished_at": "2026-05-23T04:22:28+00:00",
        "model": "live-matchmaking",
        "repo_revision": "bdc8b30c9426e44e1cc02f580a33f71ad4a58186",
        "run_id": "live-matchmaking-20260523T040002Z",
        "s3_uri": "s3://calbench-traces/calbench-mixed/live-matchmaking/live-matchmaking-20260523T040002Z/",
        "started_at": "2026-05-23T04:05:02+00:00",
        "state": "done",
        "target_count": 7,
        "targets": [
          "claude-sonnet-4-6",
          "deepseek/deepseek-v4-pro",
          "gpt-5.4-mini"
        ],
        "updated_at": "2026-05-23T04:23:19+00:00",
        "worker_progress": {
          "completed": 3,
          "expected": 3
        }
      },
      {
        "calendar_revision": "3d3eed1f7d575f5215f29dd0ebb2151c9ef454a5",
        "created_at": "2026-05-23T03:00:04+00:00",
        "experiments": [
          "experiments/generated/live-matchmaking-20260523T030001Z.yaml"
        ],
        "finished_at": "2026-05-23T03:15:33+00:00",
        "model": "live-matchmaking",
        "repo_revision": "bdc8b30c9426e44e1cc02f580a33f71ad4a58186",
        "run_id": "live-matchmaking-20260523T030001Z",
        "s3_uri": "s3://calbench-traces/calbench-mixed/live-matchmaking/live-matchmaking-20260523T030001Z/",
        "started_at": "2026-05-23T03:05:01+00:00",
        "state": "done",
        "target_count": 7,
        "targets": [
          "claude-sonnet-4-6",
          "deepseek/deepseek-v4-pro",
          "gpt-5.4-mini"
        ],
        "updated_at": "2026-05-23T03:16:25+00:00",
        "worker_progress": {
          "completed": 3,
          "expected": 3
        }
      },
      {
        "calendar_revision": "3d3eed1f7d575f5215f29dd0ebb2151c9ef454a5",
        "created_at": "2026-05-23T02:00:04+00:00",
        "experiments": [
          "experiments/generated/live-matchmaking-20260523T020002Z.yaml"
        ],
        "finished_at": "2026-05-23T02:23:40+00:00",
        "model": "live-matchmaking",
        "repo_revision": "bdc8b30c9426e44e1cc02f580a33f71ad4a58186",
        "run_id": "live-matchmaking-20260523T020002Z",
        "s3_uri": "s3://calbench-traces/calbench-mixed/live-matchmaking/live-matchmaking-20260523T020002Z/",
        "started_at": "2026-05-23T02:05:02+00:00",
        "state": "done",
        "target_count": 7,
        "targets": [
          "claude-sonnet-4-6",
          "deepseek/deepseek-v4-pro",
          "gpt-5.4-mini"
        ],
        "updated_at": "2026-05-23T02:24:33+00:00",
        "worker_progress": {
          "completed": 3,
          "expected": 3
        }
      },
      {
        "calendar_revision": "3d3eed1f7d575f5215f29dd0ebb2151c9ef454a5",
        "created_at": "2026-05-23T01:00:04+00:00",
        "experiments": [
          "experiments/generated/live-matchmaking-20260523T010001Z.yaml"
        ],
        "finished_at": "2026-05-23T01:23:02+00:00",
        "model": "live-matchmaking",
        "repo_revision": "bdc8b30c9426e44e1cc02f580a33f71ad4a58186",
        "run_id": "live-matchmaking-20260523T010001Z",
        "s3_uri": "s3://calbench-traces/calbench-mixed/live-matchmaking/live-matchmaking-20260523T010001Z/",
        "started_at": "2026-05-23T01:05:01+00:00",
        "state": "done",
        "target_count": 7,
        "targets": [
          "claude-sonnet-4-6",
          "deepseek/deepseek-v4-pro",
          "gpt-5.4-mini"
        ],
        "updated_at": "2026-05-23T01:23:53+00:00",
        "worker_progress": {
          "completed": 3,
          "expected": 3
        }
      },
      {
        "calendar_revision": "3d3eed1f7d575f5215f29dd0ebb2151c9ef454a5",
        "created_at": "2026-05-23T00:00:04+00:00",
        "experiments": [
          "experiments/generated/live-matchmaking-20260523T000002Z.yaml"
        ],
        "finished_at": "2026-05-23T00:17:51+00:00",
        "model": "live-matchmaking",
        "repo_revision": "bdc8b30c9426e44e1cc02f580a33f71ad4a58186",
        "run_id": "live-matchmaking-20260523T000002Z",
        "s3_uri": "s3://calbench-traces/calbench-mixed/live-matchmaking/live-matchmaking-20260523T000002Z/",
        "started_at": "2026-05-23T00:05:02+00:00",
        "state": "done",
        "target_count": 7,
        "targets": [
          "claude-sonnet-4-6",
          "deepseek/deepseek-v4-pro",
          "gpt-5.4-mini"
        ],
        "updated_at": "2026-05-23T00:18:42+00:00",
        "worker_progress": {
          "completed": 3,
          "expected": 3
        }
      }
    ],
    "root": "/home/ec2-user/calbench-jobs"
  },
  "leaderboard": [
    {
      "coordination_ratio_mu": 26.581760688802582,
      "coordination_ratio_raw_mean": 0.8669226564299434,
      "coordination_ratio_sigma": 2.4374659872476365,
      "excess_cost_mu": 27.843243992655957,
      "excess_cost_raw_mean": 25.351247600767753,
      "excess_cost_sigma": 2.4083060314180647,
      "excess_vps_mu": 28.928108902011655,
      "excess_vps_raw_mean": 9.363403710812548,
      "excess_vps_sigma": 1.8065896201386658,
      "games_played": 521,
      "mmr": 27.609866898453532,
      "mmr_sigma": 1.3656779439095332,
      "player_id": "publishers/google/models/gemini-3.1-pro-preview",
      "rank_gap_to_next": 1.9287909988796557,
      "rank_gap_z_to_next": 0.9858860451919308
    },
    {
      "coordination_ratio_mu": 26.59577228085082,
      "coordination_ratio_raw_mean": 0.8767507619047619,
      "coordination_ratio_sigma": 2.5122836917899516,
      "excess_cost_mu": 23.401030091165346,
      "excess_cost_raw_mean": 23.19607843137255,
      "excess_cost_sigma": 2.456488475931908,
      "excess_vps_mu": 27.40962582130272,
      "excess_vps_raw_mean": 9.488328664799262,
      "excess_vps_sigma": 1.847729645805651,
      "games_played": 357,
      "mmr": 25.681075899573877,
      "mmr_sigma": 1.4008707161626424,
      "player_id": "qwen/qwen3.6-plus",
      "rank_gap_to_next": 0.3080214037925444,
      "rank_gap_z_to_next": 0.1582948448481517
    },
    {
      "coordination_ratio_mu": 23.17194359786398,
      "coordination_ratio_raw_mean": 0.801703240875914,
      "coordination_ratio_sigma": 2.3729018887148463,
      "excess_cost_mu": 23.205098391255934,
      "excess_cost_raw_mean": 26.613138686131386,
      "excess_cost_sigma": 2.411984516035075,
      "excess_vps_mu": 31.929970478784647,
      "excess_vps_raw_mean": 5.8789537712895354,
      "excess_vps_sigma": 1.8348010301272284,
      "games_played": 548,
      "mmr": 25.373054495781332,
      "mmr_sigma": 1.3505467618522213,
      "player_id": "gpt-5.4-mini",
      "rank_gap_to_next": 0.051913366576862074,
      "rank_gap_z_to_next": 0.027345363655360198
    },
    {
      "coordination_ratio_mu": 25.38352207760709,
      "coordination_ratio_raw_mean": 0.8491921741472188,
      "coordination_ratio_sigma": 2.3775740462834025,
      "excess_cost_mu": 25.73394416406919,
      "excess_cost_raw_mean": 24.28725314183124,
      "excess_cost_sigma": 2.3495045344782977,
      "excess_vps_mu": 24.643407362949663,
      "excess_vps_raw_mean": 9.628366247755825,
      "excess_vps_sigma": 1.786153556855533,
      "games_played": 557,
      "mmr": 25.32114112920447,
      "mmr_sigma": 1.3341944701314437,
      "player_id": "publishers/google/models/gemini-3-flash-preview",
      "rank_gap_to_next": 0.18242956898865614,
      "rank_gap_z_to_next": 0.09407864270951113
    },
    {
      "coordination_ratio_mu": 25.767766605007512,
      "coordination_ratio_raw_mean": 0.880036684065934,
      "coordination_ratio_sigma": 2.500061474412272,
      "excess_cost_mu": 27.573815266352497,
      "excess_cost_raw_mean": 24.035714285714285,
      "excess_cost_sigma": 2.4828963533407564,
      "excess_vps_mu": 20.723078299957734,
      "excess_vps_raw_mean": 16.005494505494507,
      "excess_vps_sigma": 1.8968135657599203,
      "games_played": 364,
      "mmr": 25.138711560215814,
      "mmr_sigma": 1.4071612873132824,
      "player_id": "deepseek/deepseek-v4-pro",
      "rank_gap_to_next": 0.7259363411477295,
      "rank_gap_z_to_next": 0.3735945584279385
    },
    {
      "coordination_ratio_mu": 24.712140776877213,
      "coordination_ratio_raw_mean": 0.8626001848428841,
      "coordination_ratio_sigma": 2.367356871613392,
      "excess_cost_mu": 27.302198695381144,
      "excess_cost_raw_mean": 25.608133086876155,
      "excess_cost_sigma": 2.3657229876692023,
      "excess_vps_mu": 19.88859745973519,
      "excess_vps_raw_mean": 14.598890942698699,
      "excess_vps_sigma": 1.8473529936538968,
      "games_played": 541,
      "mmr": 24.412775219068084,
      "mmr_sigma": 1.3399940972153555,
      "player_id": "claude-sonnet-4-6",
      "rank_gap_to_next": 0.527780863066841,
      "rank_gap_z_to_next": 0.2654191199916867
    },
    {
      "coordination_ratio_mu": 25.420937729905898,
      "coordination_ratio_raw_mean": 0.8295455340909089,
      "coordination_ratio_sigma": 2.606638228846713,
      "excess_cost_mu": 23.667279662546587,
      "excess_cost_raw_mean": 21.664772727272727,
      "excess_cost_sigma": 2.6117653314164366,
      "excess_vps_mu": 21.732285528590314,
      "excess_vps_raw_mean": 10.728219696969704,
      "excess_vps_sigma": 1.9420845131429247,
      "games_played": 352,
      "mmr": 23.884994356001243,
      "mmr_sigma": 1.46917401825587,
      "player_id": "meta/llama-4-maverick-17b-128e-instruct-maas",
      "rank_gap_to_next": null,
      "rank_gap_z_to_next": null
    }
  ],
  "leaderboard_id": "calbench-mixed",
  "metrics": [
    {
      "higher_is_better": true,
      "name": "coordination_ratio",
      "tie_tolerance": 0.0,
      "weight": 0.4
    },
    {
      "higher_is_better": false,
      "name": "excess_cost",
      "tie_tolerance": 0.0,
      "weight": 0.35
    },
    {
      "higher_is_better": false,
      "name": "excess_vps",
      "tie_tolerance": 0.0,
      "weight": 0.25
    }
  ],
  "player_count": 7,
  "trace_count": 648
}