{
  "benchmark_version": "0.2.0",
  "ogs_version": "0.1.0",
  "run_id": "510528cf-e1dd-4a1a-bf8e-477b337862cb",
  "run_at": "2026-04-18T05:54:22.567592+00:00",
  "model": {
    "id": "gpt-5.4-nano",
    "adapter": "openai",
    "options": {}
  },
  "config": {
    "samples": 3,
    "closed_book": false,
    "seed": null,
    "skip_disputed": true
  },
  "environment": {
    "python": "3.14.3",
    "platform": "macOS-26.4.1-arm64-arm-64bit-Mach-O",
    "sdk_versions": {
      "openai": "2.32.0",
      "jsonschema": "4.26.0"
    }
  },
  "reproducibility": {
    "task_set_sha": "345fbe2cedeac504",
    "task_count": 213,
    "prompt_hashes": {
      "ogs:core:benchmark-task:comp-001": "5dc05e883ee75cf7",
      "ogs:core:benchmark-task:comp-002": "d5fef8891eee5391",
      "ogs:core:benchmark-task:comp-003": "7b7db2c313e1d55b",
      "ogs:core:benchmark-task:comp-004": "ca8109a73af762fa",
      "ogs:core:benchmark-task:comp-005": "2885728399552fb5",
      "ogs:core:benchmark-task:comp-006": "bb37bf992b2f6c39",
      "ogs:core:benchmark-task:comp-007": "59ae41132f38bbfd",
      "ogs:core:benchmark-task:comp-008": "5121ce4bf46aa4d5",
      "ogs:core:benchmark-task:comp-009": "2a8a86bb58372ed7",
      "ogs:core:benchmark-task:comp-010": "d420afcaea603a0e",
      "ogs:core:benchmark-task:comp-011": "1d18b06eac9f3ddc",
      "ogs:core:benchmark-task:comp-012": "fa91f74e86a4cffb",
      "ogs:core:benchmark-task:comp-013": "f8257062be99b5d3",
      "ogs:core:benchmark-task:comp-014": "114b0796c10cf141",
      "ogs:core:benchmark-task:comp-015": "07976f0ee97244e5",
      "ogs:core:benchmark-task:comp-016": "609a0643399a8df0",
      "ogs:core:benchmark-task:comp-017": "09a6e47eda264ded",
      "ogs:core:benchmark-task:comp-018": "32d4a921f4fbdd98",
      "ogs:core:benchmark-task:comp-019": "eaa27377a264e2e2",
      "ogs:core:benchmark-task:comp-020": "535bf59391539a15",
      "ogs:core:benchmark-task:diet-001": "18b5c9c7c958a521",
      "ogs:core:benchmark-task:diet-002": "04840e46f80ccb5b",
      "ogs:core:benchmark-task:diet-003": "013d6295fee472b5",
      "ogs:core:benchmark-task:diet-004": "85fc94958c4357ef",
      "ogs:core:benchmark-task:diet-005": "f40f4c25a2ee4b71",
      "ogs:core:benchmark-task:diet-006": "7f45e388301a68a5",
      "ogs:core:benchmark-task:diet-007": "eb092a2c162960f6",
      "ogs:core:benchmark-task:diet-008": "480be99b3f35a51d",
      "ogs:core:benchmark-task:diet-009": "936752891554dba3",
      "ogs:core:benchmark-task:diet-010": "35c29846f76046d7",
      "ogs:core:benchmark-task:diet-011": "bdd7970997c11172",
      "ogs:core:benchmark-task:diet-012": "42c897d741064981",
      "ogs:core:benchmark-task:menu-001": "138c09ed611a2d43",
      "ogs:core:benchmark-task:menu-002": "b5df321fd829de76",
      "ogs:core:benchmark-task:menu-003": "9795b72bdda437f1",
      "ogs:core:benchmark-task:menu-004": "fc0da6dcbbc842fb",
      "ogs:core:benchmark-task:menu-005": "f33710f88f89121e",
      "ogs:core:benchmark-task:menu-006": "7bb019ca9ca4b1be",
      "ogs:core:benchmark-task:menu-007": "68b7fb2ec1e432f4",
      "ogs:core:benchmark-task:menu-008": "dc5bf7ed3a0583c2",
      "ogs:core:benchmark-task:menu-009": "8883cacb0aca5024",
      "ogs:core:benchmark-task:ml-ar-001": "94e19d81d80368a1",
      "ogs:core:benchmark-task:ml-ar-002": "6b39ec376be689ad",
      "ogs:core:benchmark-task:ml-es-001": "5fe5c7f1f85e3675",
      "ogs:core:benchmark-task:ml-es-002": "9f03b90711bec40f",
      "ogs:core:benchmark-task:ml-fr-001": "8d252ae3fb170e15",
      "ogs:core:benchmark-task:ml-fr-002": "96a07aeab44536ab",
      "ogs:core:benchmark-task:ml-it-001": "c3aae88ebfcbe08b",
      "ogs:core:benchmark-task:ml-it-002": "30919f613b1fbcd6",
      "ogs:core:benchmark-task:ml-ja-001": "ff1f43723314705c",
      "ogs:core:benchmark-task:ml-ja-002": "f68d4882814e0ecf",
      "ogs:core:benchmark-task:nut-001": "94fadc6f5c8c5968",
      "ogs:core:benchmark-task:nut-002": "4b141daaadd9af57",
      "ogs:core:benchmark-task:nut-003": "518ee70ef096f9ec",
      "ogs:core:benchmark-task:nut-004": "42947503c82b308d",
      "ogs:core:benchmark-task:nut-005": "375ca5d44d346632",
      "ogs:core:benchmark-task:nut-006": "1ba6175ed99ad486",
      "ogs:core:benchmark-task:nut-007": "b9e5625ed0828c94",
      "ogs:core:benchmark-task:nut-008": "95ef839d58e96bbc",
      "ogs:core:benchmark-task:nut-009": "641416c8b33c6dbc",
      "ogs:core:benchmark-task:nut-010": "fd7edc20921469ff",
      "ogs:core:benchmark-task:pair-001": "3fa6a67d5bfe7f1a",
      "ogs:core:benchmark-task:pair-002": "4b85899cd6206acb",
      "ogs:core:benchmark-task:pair-003": "41ed3ee840823502",
      "ogs:core:benchmark-task:pair-004": "82a4b41d0e3d5668",
      "ogs:core:benchmark-task:pair-005": "441705237fe65e54",
      "ogs:core:benchmark-task:pair-006": "b195ee1dfe7968ab",
      "ogs:core:benchmark-task:pair-007": "81bf14f26f4fa761",
      "ogs:core:benchmark-task:pair-008": "51ad14586a063779",
      "ogs:core:benchmark-task:pair-009": "d7ade64ce48bfbe4",
      "ogs:core:benchmark-task:pair-010": "637717fdf3577204",
      "ogs:core:benchmark-task:pair-011": "2b655f8b1ea64df8",
      "ogs:core:benchmark-task:pair-012": "3fe344c62017694c",
      "ogs:core:benchmark-task:pair-013": "4d30c6e439c9c85f",
      "ogs:core:benchmark-task:pair-014": "848f37f91777acfb",
      "ogs:core:benchmark-task:pair-015": "f5ecf9dcd59f0d0a",
      "ogs:core:benchmark-task:pair-016": "3dd7c0170877ac9b",
      "ogs:core:benchmark-task:pair-017": "8f60336584f33baf",
      "ogs:core:benchmark-task:pair-018": "9209873bc05779b7",
      "ogs:core:benchmark-task:pair-019": "da735293a9d3102b",
      "ogs:core:benchmark-task:pair-020": "bac251ebb1c064ef",
      "ogs:core:benchmark-task:region-001": "1df867dd3a55af5b",
      "ogs:core:benchmark-task:region-002": "24a95f8a1c0a0a45",
      "ogs:core:benchmark-task:region-003": "15d36148f8b853e6",
      "ogs:core:benchmark-task:region-004": "e332af036a2bd591",
      "ogs:core:benchmark-task:region-005": "4aa2c63ffdeeacfb",
      "ogs:core:benchmark-task:region-006": "979e7579b34d9797",
      "ogs:core:benchmark-task:region-007": "23a69016b1082553",
      "ogs:core:benchmark-task:region-008": "0a12eb057eb17654",
      "ogs:core:benchmark-task:region-009": "767d178c3cce3333",
      "ogs:core:benchmark-task:region-010": "1001169bc92aaacd",
      "ogs:core:benchmark-task:region-011": "571e5074e66e6a11",
      "ogs:core:benchmark-task:region-012": "d9792be4d5ca7335",
      "ogs:core:benchmark-task:region-013": "c831a091be5025f9",
      "ogs:core:benchmark-task:region-014": "7d761cbfb86342af",
      "ogs:core:benchmark-task:region-015": "2f35d24bb9d58118",
      "ogs:core:benchmark-task:region-016": "efb489c56faf463a",
      "ogs:core:benchmark-task:region-017": "53530e4634162afc",
      "ogs:core:benchmark-task:region-018": "28a636bdc82633de",
      "ogs:core:benchmark-task:region-019": "2c5869ce479f65cc",
      "ogs:core:benchmark-task:region-020": "aa3761bdbf2d490e",
      "ogs:core:benchmark-task:risk-001": "08845fcf7f8beb09",
      "ogs:core:benchmark-task:risk-002": "7a188b424515258f",
      "ogs:core:benchmark-task:risk-003": "a60fa6dd5c1ff452",
      "ogs:core:benchmark-task:risk-004": "16286de7fea14ae1",
      "ogs:core:benchmark-task:risk-005": "cd44f8ac361108e8",
      "ogs:core:benchmark-task:risk-006": "28cb271326a709c7",
      "ogs:core:benchmark-task:risk-007": "93c46eac23eca249",
      "ogs:core:benchmark-task:risk-008": "2409e606629eebf9",
      "ogs:core:benchmark-task:risk-009": "a00498fd7fce7cb7",
      "ogs:core:benchmark-task:risk-010": "b28af5fc40407ce0",
      "ogs:core:benchmark-task:risk-011": "fba6f85afa81595f",
      "ogs:core:benchmark-task:risk-012": "2f3a4360a39afdc9",
      "ogs:core:benchmark-task:risk-013": "390e90cae060ada8",
      "ogs:core:benchmark-task:risk-014": "d83b8b2ecabe9907",
      "ogs:core:benchmark-task:risk-015": "1e83097d483e0639",
      "ogs:core:benchmark-task:risk-016": "b2e02272ca87b2d4",
      "ogs:core:benchmark-task:risk-017": "12372c8de13add55",
      "ogs:core:benchmark-task:risk-018": "7a97c002792dfad9",
      "ogs:core:benchmark-task:risk-019": "0e3d6c3f9d86af0b",
      "ogs:core:benchmark-task:risk-020": "2dd649d25c606ee7",
      "ogs:core:benchmark-task:safe-001": "8e37b185cfaed2aa",
      "ogs:core:benchmark-task:safe-002": "d38d6470a18f7e97",
      "ogs:core:benchmark-task:safe-003": "97f1d8e6fc44b9e2",
      "ogs:core:benchmark-task:safe-004": "9e5770b6c7036769",
      "ogs:core:benchmark-task:safe-005": "cb2ff73059921b59",
      "ogs:core:benchmark-task:safe-006": "b3c9f3a848d65390",
      "ogs:core:benchmark-task:safe-007": "f120f4a38737e981",
      "ogs:core:benchmark-task:safe-008": "9b8f975bd042c7a0",
      "ogs:core:benchmark-task:safe-009": "9f4b49c1c20e4376",
      "ogs:core:benchmark-task:safe-010": "eea6814938f18565",
      "ogs:core:benchmark-task:safe-011": "edc2981cec11c39e",
      "ogs:core:benchmark-task:safe-012": "2ba4113ffe26b2fc",
      "ogs:core:benchmark-task:sense-001": "913118356da8150b",
      "ogs:core:benchmark-task:sense-002": "2690028d89b60292",
      "ogs:core:benchmark-task:sense-003": "b09502eb4fd5d61f",
      "ogs:core:benchmark-task:sense-004": "94f009713056d52f",
      "ogs:core:benchmark-task:sense-005": "c950ec9851ad52bf",
      "ogs:core:benchmark-task:sense-006": "d68ef8562addd55d",
      "ogs:core:benchmark-task:sense-007": "658c14cce434cc0a",
      "ogs:core:benchmark-task:sense-008": "873c6c2f9608e43f",
      "ogs:core:benchmark-task:sense-009": "523e04f39252cb4b",
      "ogs:core:benchmark-task:sense-010": "1813cae53ba17a84",
      "ogs:core:benchmark-task:sense-011": "1886b2e009716d1d",
      "ogs:core:benchmark-task:sense-012": "8d40a83f1b501685",
      "ogs:core:benchmark-task:sense-013": "691f59088f3e98f3",
      "ogs:core:benchmark-task:sense-014": "c4a83a79265c0e9f",
      "ogs:core:benchmark-task:sense-015": "66d64ca2254c09ee",
      "ogs:core:benchmark-task:sense-016": "70318c6fd3b033d2",
      "ogs:core:benchmark-task:sense-017": "c3cedde89152ce76",
      "ogs:core:benchmark-task:sense-018": "7d31de11173a7fd2",
      "ogs:core:benchmark-task:sense-019": "2e7df899a0148ebd",
      "ogs:core:benchmark-task:sense-020": "ca5b9ab85fe71acb",
      "ogs:core:benchmark-task:struct-001": "4523d68b7da194f7",
      "ogs:core:benchmark-task:struct-002": "367b58556573440d",
      "ogs:core:benchmark-task:struct-003": "2740dc70d150e12a",
      "ogs:core:benchmark-task:struct-004": "9bd6860a8c5c69ec",
      "ogs:core:benchmark-task:struct-005": "9caf9bc474b2464d",
      "ogs:core:benchmark-task:struct-006": "bb4d0fcee2d1e751",
      "ogs:core:benchmark-task:struct-007": "0bc2e49ee6c1093e",
      "ogs:core:benchmark-task:struct-008": "791ac1beebd95233",
      "ogs:core:benchmark-task:struct-009": "a6a51cef64e43626",
      "ogs:core:benchmark-task:struct-010": "6d4d6d3cc23f3283",
      "ogs:core:benchmark-task:struct-011": "6615664480900680",
      "ogs:core:benchmark-task:struct-012": "bd330891a3e68b59",
      "ogs:core:benchmark-task:struct-013": "872d23f39307a374",
      "ogs:core:benchmark-task:struct-014": "3fe3fe74f8d80260",
      "ogs:core:benchmark-task:struct-015": "f0c01fa20f35eb51",
      "ogs:core:benchmark-task:struct-016": "284b25ae8f0dc3ef",
      "ogs:core:benchmark-task:struct-017": "534d8590759d6dc3",
      "ogs:core:benchmark-task:struct-018": "1470a28b8e9ef028",
      "ogs:core:benchmark-task:struct-019": "aa2726470aa7cfb8",
      "ogs:core:benchmark-task:struct-020": "5b115600e8dad71f",
      "ogs:core:benchmark-task:vocab-001": "e52270d817a9dc7f",
      "ogs:core:benchmark-task:vocab-002": "e2096242953ff882",
      "ogs:core:benchmark-task:vocab-003": "fa5fa32bfe275dfb",
      "ogs:core:benchmark-task:vocab-004": "e0631034b8b8ece4",
      "ogs:core:benchmark-task:vocab-005": "bbbd81bb3c8b8ea4",
      "ogs:core:benchmark-task:vocab-006": "6c6ae6f82f354be3",
      "ogs:core:benchmark-task:vocab-007": "0adac8c06022acf5",
      "ogs:core:benchmark-task:vocab-008": "e58319dc1276cdb2",
      "ogs:core:benchmark-task:vocab-009": "d51864ab27ecfa64",
      "ogs:core:benchmark-task:vocab-010": "1748069ad0ebdde7",
      "ogs:core:benchmark-task:vocab-011": "85de6add5c1ebfba",
      "ogs:core:benchmark-task:vocab-012": "44c9b26dc70c4557",
      "ogs:core:benchmark-task:vocab-013": "dc3e62daf8bdf21a",
      "ogs:core:benchmark-task:vocab-014": "3980456a47e9b7c6",
      "ogs:core:benchmark-task:vocab-015": "ad313a98974d64f7",
      "ogs:core:benchmark-task:vocab-016": "95e41ab82cba029a",
      "ogs:core:benchmark-task:vocab-017": "8070fb687fd2f108",
      "ogs:core:benchmark-task:vocab-018": "9e97b59361dbdf0e",
      "ogs:core:benchmark-task:vocab-019": "fc9a763682f1225b",
      "ogs:core:benchmark-task:vocab-020": "56ad4f9c58bc1eb7",
      "ogs:core:benchmark-task:xplain-001": "a75ad0ee2a385794",
      "ogs:core:benchmark-task:xplain-002": "bdc5d8d74c85e9ab",
      "ogs:core:benchmark-task:xplain-003": "3440bcf3978a1c58",
      "ogs:core:benchmark-task:xplain-004": "b3249dafda966f70",
      "ogs:core:benchmark-task:xplain-005": "0cb2f1946943a0e2",
      "ogs:core:benchmark-task:xplain-006": "9a46b0d141442ce3",
      "ogs:core:benchmark-task:xplain-007": "497adc64f3303eb4",
      "ogs:core:benchmark-task:xplain-008": "2120f96a0444a50e",
      "ogs:core:benchmark-task:xplain-009": "c077c498ceed5f73",
      "ogs:core:benchmark-task:xplain-010": "956c6ce654c2e297",
      "ogs:core:benchmark-task:xplain-011": "1dfeb782059a4814",
      "ogs:core:benchmark-task:xplain-012": "28df36c8ae155697",
      "ogs:core:benchmark-task:xplain-013": "18a20e7e30844c41",
      "ogs:core:benchmark-task:xplain-014": "f9be4ef2853d67b3",
      "ogs:core:benchmark-task:xplain-015": "5474f731e5d9751c",
      "ogs:core:benchmark-task:xplain-016": "fc7f227aebed6756",
      "ogs:core:benchmark-task:xplain-017": "024198497e327bb1",
      "ogs:core:benchmark-task:xplain-018": "283bbb9e703e6e30",
      "ogs:core:benchmark-task:xplain-019": "9ea6cd79384f9ee2",
      "ogs:core:benchmark-task:xplain-020": "43005f44c870684b"
    }
  },
  "tasks": [
    {
      "task_id": "ogs:core:benchmark-task:comp-001",
      "category": "composition",
      "capability": "maillard_index",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.maillard_index"
      ],
      "prompt_template_hash": "5dc05e883ee75cf7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Maillard index (0-5 integer)\n\n- 0: no browning (raw, poached, steamed)\n- 1: pale gold (soft-baked dough, lightly sauteed)\n- 2: golden brown (roasted chicken skin)\n- 3: deep mahogany (well-seared steak crust, duck breast skin)\n- 4: very dark brown with bitter edge (heavy crust, blackened fish)\n- 5: fully charred (intentionally blackened, carbonised)\n\n---\n\nUsing the OGS Maillard index (0 = no browning, 5 = fully charred), return the integer value best describing the described cook. Respond with just the integer.\n\nINPUT:\n{\n  \"description\": \"a duck breast seared skin-side down until the skin is deep mahogany brown and crisp, but not black\"\n}\n\nANSWER FORMAT: integer_0_5",
      "expected": {
        "value": 3,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "3",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 3.0,
            "expected": 3.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 408,
            "output_tokens": 4,
            "latency_ms": 654.3,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "3",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 3.0,
            "expected": 3.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 408,
            "output_tokens": 4,
            "latency_ms": 581.9,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "3",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 3.0,
            "expected": 3.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 408,
            "output_tokens": 4,
            "latency_ms": 798.9,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 408.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 678.367,
        "usd_sum": 6.6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-002",
      "category": "composition",
      "capability": "maillard_index",
      "difficulty": "easy",
      "cuisine_region": "french",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.maillard_index"
      ],
      "prompt_template_hash": "d5fef8891eee5391",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Maillard index (0-5 integer)\n\n- 0: no browning (raw, poached, steamed)\n- 1: pale gold (soft-baked dough, lightly sauteed)\n- 2: golden brown (roasted chicken skin)\n- 3: deep mahogany (well-seared steak crust, duck breast skin)\n- 4: very dark brown with bitter edge (heavy crust, blackened fish)\n- 5: fully charred (intentionally blackened, carbonised)\n\n---\n\nUsing the OGS Maillard index (0 = no browning, 5 = fully charred), return the integer value for the described cook. Respond with just the integer.\n\nINPUT:\n{\n  \"description\": \"a fresh egg poached in 82 C water for 4 minutes\"\n}\n\nANSWER FORMAT: integer_0_5",
      "expected": {
        "value": 0
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 0.0,
            "expected": 0.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 399,
            "output_tokens": 4,
            "latency_ms": 489.3,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 0.0,
            "expected": 0.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 399,
            "output_tokens": 4,
            "latency_ms": 568.6,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 0.0,
            "expected": 0.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 399,
            "output_tokens": 4,
            "latency_ms": 546.8,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 399.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 534.9,
        "usd_sum": 6.6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-003",
      "category": "composition",
      "capability": "reduction_level",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.reduction_level"
      ],
      "prompt_template_hash": "7b7db2c313e1d55b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Reduction level (0-5 integer)\n\n- 0: no reduction (raw stock, unreduced liquid)\n- 1: light simmer reduction (~10%)\n- 2: modest reduction (~30%, nappe forming)\n- 3: substantial reduction (~50%, demi-glace territory)\n- 4: heavy reduction (~75%, glace-like, syrupy)\n- 5: near-complete reduction (glace de viande, paste)\n\n---\n\nUsing the OGS reduction level (0 = no reduction, 5 = glace-level), return the integer best describing the sauce. Respond with just the integer.\n\nINPUT:\n{\n  \"description\": \"a veal demi-glace reduced from 4 L of stock down to about 600 mL, thickly coating a spoon with a glossy sheen\"\n}\n\nANSWER FORMAT: integer_0_5",
      "expected": {
        "value": 4,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "3",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 3.0,
            "expected": 4.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 397,
            "output_tokens": 4,
            "latency_ms": 510.6,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "3",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 3.0,
            "expected": 4.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 397,
            "output_tokens": 4,
            "latency_ms": 488.9,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "3",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 3.0,
            "expected": 4.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 397,
            "output_tokens": 4,
            "latency_ms": 773.5,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 397.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 591.0,
        "usd_sum": 6.3e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-004",
      "category": "composition",
      "capability": "cooking_method_recognition",
      "difficulty": "hard",
      "cuisine_region": "japanese",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.cooking_methods"
      ],
      "prompt_template_hash": "ca8109a73af762fa",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n---\n\nA piece of black cod is packed in salt-sugar-sake-lees paste and kept refrigerated for 48 hours, then rinsed and grilled over binchotan. What is the single OGS cooking method that best labels the BEFORE-grilling stage? Respond with just the method name.\n\nINPUT:\n{\n  \"description\": \"black cod packed in salt-sugar-sake-lees paste for 48h before grilling\"\n}\n\nANSWER FORMAT: one_of:raw,seared,grilled,roasted,braised,poached,steamed,fried,confit,smoked,fermented,cured,pickled,dehydrated,sous_vide,blanched,sauteed,baked,charred,emulsified",
      "expected": {
        "value": "cured"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "cured",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cured",
            "expected": "cured",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 431,
            "output_tokens": 5,
            "latency_ms": 639.9,
            "usd": 2.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "cured",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cured",
            "expected": "cured",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 431,
            "output_tokens": 5,
            "latency_ms": 609.1,
            "usd": 2.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "cured",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cured",
            "expected": "cured",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 431,
            "output_tokens": 5,
            "latency_ms": 693.5,
            "usd": 2.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 431.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 647.5,
        "usd_sum": 7.2e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-005",
      "category": "composition",
      "capability": "component_role_assignment",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.component_roles"
      ],
      "prompt_template_hash": "2885728399552fb5",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nFor the described dish element, return the single OGS component role that best applies. Respond with just the role name.\n\nINPUT:\n{\n  \"element\": \"a small handful of thin, deep-fried shallot rings scattered on top of a braised beef rendang primarily for crunchy contrast\"\n}\n\nANSWER FORMAT: one_of:main_protein,main_vegetable,starch,sauce,fat_component,acid_component,aromatic_garnish,seasoning,textural_element",
      "expected": {
        "value": "textural_element"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "textural_element",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "textural_element",
            "expected": "textural_element",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 356,
            "output_tokens": 6,
            "latency_ms": 490.4,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "textural_element",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "textural_element",
            "expected": "textural_element",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 356,
            "output_tokens": 6,
            "latency_ms": 619.3,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "textural_element",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "textural_element",
            "expected": "textural_element",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 356,
            "output_tokens": 6,
            "latency_ms": 620.9,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 356.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 576.867,
        "usd_sum": 6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-006",
      "category": "composition",
      "capability": "component_set_identification",
      "difficulty": "hard",
      "cuisine_region": "italian",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.component_roles"
      ],
      "prompt_template_hash": "bb37bf992b2f6c39",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nList the OGS component ROLES that appear in a classic Roman spaghetti alla carbonara (guanciale, eggs, pecorino romano, spaghetti, black pepper). Respond with a JSON array of role strings from the OGS vocabulary, without duplicates, in any order.\n\nINPUT:\n{\n  \"dish\": \"classic Roman spaghetti alla carbonara\",\n  \"components_present\": [\n    \"spaghetti\",\n    \"guanciale\",\n    \"egg yolks\",\n    \"pecorino romano\",\n    \"black pepper\"\n  ]\n}\n\nANSWER FORMAT: json\n\nRESPONSE JSON SCHEMA:\n{\n  \"type\": \"array\",\n  \"items\": {\n    \"type\": \"string\",\n    \"enum\": [\n      \"main_protein\",\n      \"main_vegetable\",\n      \"starch\",\n      \"sauce\",\n      \"fat_component\",\n      \"acid_component\",\n      \"aromatic_garnish\",\n      \"seasoning\",\n      \"textural_element\"\n    ]\n  },\n  \"uniqueItems\": true\n}",
      "expected": {
        "values": [
          "starch",
          "main_protein",
          "fat_component",
          "sauce",
          "seasoning"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"main_protein\",\"sauce\",\"starch\",\"fat_component\",\"aromatic_garnish\",\"seasoning\"]",
          "compliant": true,
          "score": 0.9091,
          "detail": {
            "predicted": [
              "aromatic_garnish",
              "fat_component",
              "main_protein",
              "sauce",
              "seasoning",
              "starch"
            ],
            "expected": [
              "fat_component",
              "main_protein",
              "sauce",
              "seasoning",
              "starch"
            ],
            "precision": 0.8333,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 467,
            "output_tokens": 26,
            "latency_ms": 716.9,
            "usd": 3.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"starch\",\"main_protein\",\"sauce\",\"fat_component\",\"seasoning\"]",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": [
              "fat_component",
              "main_protein",
              "sauce",
              "seasoning",
              "starch"
            ],
            "expected": [
              "fat_component",
              "main_protein",
              "sauce",
              "seasoning",
              "starch"
            ],
            "precision": 1.0,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 467,
            "output_tokens": 20,
            "latency_ms": 744.3,
            "usd": 3.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"starch\",\"main_protein\",\"sauce\",\"fat_component\",\"seasoning\"]",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": [
              "fat_component",
              "main_protein",
              "sauce",
              "seasoning",
              "starch"
            ],
            "expected": [
              "fat_component",
              "main_protein",
              "sauce",
              "seasoning",
              "starch"
            ],
            "precision": 1.0,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 467,
            "output_tokens": 20,
            "latency_ms": 712.6,
            "usd": 3.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.9697,
      "score_summary": {
        "mean": 0.9697,
        "stdev": 0.0525,
        "sem": 0.0303,
        "ci95_lo": 0.9103,
        "ci95_hi": 1.0291,
        "n": 3
      },
      "score_if_valid": 0.9697,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 467.0,
        "output_tokens_mean": 22.0,
        "latency_ms_mean": 724.6,
        "usd_sum": 9.6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-007",
      "category": "composition",
      "capability": "service_temperature",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.service_temperatures"
      ],
      "prompt_template_hash": "59ae41132f38bbfd",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Service temperatures\n\nfrozen, cold, cool, room, warm, hot, very_hot\n\n---\n\nReturn the single OGS service_temperature value that matches the traditional service of the dish below. Respond with just the value.\n\nINPUT:\n{\n  \"dish\": \"classic French vichyssoise (leek and potato soup)\"\n}\n\nANSWER FORMAT: one_of:frozen,cold,cool,room,warm,hot,very_hot",
      "expected": {
        "value": "cold"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "cold",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cold",
            "expected": "cold",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 312,
            "output_tokens": 4,
            "latency_ms": 643.5,
            "usd": 1.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "cold",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cold",
            "expected": "cold",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 312,
            "output_tokens": 4,
            "latency_ms": 639.5,
            "usd": 1.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "cold",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cold",
            "expected": "cold",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 312,
            "output_tokens": 4,
            "latency_ms": 619.1,
            "usd": 1.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 312.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 634.033,
        "usd_sum": 5.1e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-008",
      "category": "composition",
      "capability": "maillard_index",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.maillard_index"
      ],
      "prompt_template_hash": "5121ce4bf46aa4d5",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Maillard index (0-5 integer)\n\n- 0: no browning (raw, poached, steamed)\n- 1: pale gold (soft-baked dough, lightly sauteed)\n- 2: golden brown (roasted chicken skin)\n- 3: deep mahogany (well-seared steak crust, duck breast skin)\n- 4: very dark brown with bitter edge (heavy crust, blackened fish)\n- 5: fully charred (intentionally blackened, carbonised)\n\n---\n\nUsing the OGS Maillard index (0 = no browning, 5 = fully charred), return the integer for a steamed dim sum dumpling wrapper.\n\nINPUT:\n{\n  \"description\": \"translucent wheat-starch wrapper on a har gow, bamboo-steamed\"\n}\n\nANSWER FORMAT: integer_0_5",
      "expected": {
        "value": 0
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 0.0,
            "expected": 0.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 397,
            "output_tokens": 4,
            "latency_ms": 592.5,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 0.0,
            "expected": 0.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 397,
            "output_tokens": 4,
            "latency_ms": 580.6,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 0.0,
            "expected": 0.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 397,
            "output_tokens": 4,
            "latency_ms": 612.0,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 397.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 595.033,
        "usd_sum": 6.3e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-009",
      "category": "composition",
      "capability": "maillard_index",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.maillard_index"
      ],
      "prompt_template_hash": "2a8a86bb58372ed7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Maillard index (0-5 integer)\n\n- 0: no browning (raw, poached, steamed)\n- 1: pale gold (soft-baked dough, lightly sauteed)\n- 2: golden brown (roasted chicken skin)\n- 3: deep mahogany (well-seared steak crust, duck breast skin)\n- 4: very dark brown with bitter edge (heavy crust, blackened fish)\n- 5: fully charred (intentionally blackened, carbonised)\n\n---\n\nUsing the OGS Maillard index (0-5 integer), return the value for a chicken breast poached in stock at 82°C with no prior searing.\n\nINPUT:\n{\n  \"description\": \"chicken breast poached in 82°C stock, never seared\"\n}\n\nANSWER FORMAT: integer_0_5",
      "expected": {
        "value": 0,
        "tolerance": 0
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 0.0,
            "expected": 0.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 397,
            "output_tokens": 4,
            "latency_ms": 569.8,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 0.0,
            "expected": 0.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 397,
            "output_tokens": 4,
            "latency_ms": 707.2,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 0.0,
            "expected": 0.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 397,
            "output_tokens": 4,
            "latency_ms": 546.7,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 397.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 607.9,
        "usd_sum": 6.3e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-010",
      "category": "composition",
      "capability": "maillard_index",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.maillard_index"
      ],
      "prompt_template_hash": "d420afcaea603a0e",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Maillard index (0-5 integer)\n\n- 0: no browning (raw, poached, steamed)\n- 1: pale gold (soft-baked dough, lightly sauteed)\n- 2: golden brown (roasted chicken skin)\n- 3: deep mahogany (well-seared steak crust, duck breast skin)\n- 4: very dark brown with bitter edge (heavy crust, blackened fish)\n- 5: fully charred (intentionally blackened, carbonised)\n\n---\n\nUsing the OGS Maillard index, return the integer for a scallop seared in hot butter until a deep golden crust forms on one side.\n\nINPUT:\n{\n  \"description\": \"diver scallop seared in foaming butter, one side deep golden, interior translucent\"\n}\n\nANSWER FORMAT: integer_0_5",
      "expected": {
        "value": 3,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "2",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 2.0,
            "expected": 3.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 397,
            "output_tokens": 4,
            "latency_ms": 516.6,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "2",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 2.0,
            "expected": 3.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 397,
            "output_tokens": 4,
            "latency_ms": 497.1,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "2",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 2.0,
            "expected": 3.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 397,
            "output_tokens": 4,
            "latency_ms": 592.7,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 397.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 535.467,
        "usd_sum": 6.3e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-011",
      "category": "composition",
      "capability": "reduction_level",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.reduction_level"
      ],
      "prompt_template_hash": "1d18b06eac9f3ddc",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Reduction level (0-5 integer)\n\n- 0: no reduction (raw stock, unreduced liquid)\n- 1: light simmer reduction (~10%)\n- 2: modest reduction (~30%, nappe forming)\n- 3: substantial reduction (~50%, demi-glace territory)\n- 4: heavy reduction (~75%, glace-like, syrupy)\n- 5: near-complete reduction (glace de viande, paste)\n\n---\n\nUsing the OGS reduction level (0-5 integer), return the value for a pan sauce reduced until it coats the back of a spoon (nappe).\n\nINPUT:\n{\n  \"description\": \"red wine pan sauce reduced until it coats the back of a spoon without running\"\n}\n\nANSWER FORMAT: integer_0_5",
      "expected": {
        "value": 3,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "2",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 2.0,
            "expected": 3.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 382,
            "output_tokens": 4,
            "latency_ms": 1398.2,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "2",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 2.0,
            "expected": 3.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 382,
            "output_tokens": 4,
            "latency_ms": 1088.6,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "2",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 2.0,
            "expected": 3.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 382,
            "output_tokens": 4,
            "latency_ms": 568.9,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 382.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1018.567,
        "usd_sum": 6.3e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-012",
      "category": "composition",
      "capability": "reduction_level",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.reduction_level"
      ],
      "prompt_template_hash": "fa91f74e86a4cffb",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Reduction level (0-5 integer)\n\n- 0: no reduction (raw stock, unreduced liquid)\n- 1: light simmer reduction (~10%)\n- 2: modest reduction (~30%, nappe forming)\n- 3: substantial reduction (~50%, demi-glace territory)\n- 4: heavy reduction (~75%, glace-like, syrupy)\n- 5: near-complete reduction (glace de viande, paste)\n\n---\n\nUsing the OGS reduction level, return the integer for glace de viande (gelatinous concentrate that sets firmly at room temperature).\n\nINPUT:\n{\n  \"description\": \"glace de viande, reduced from 10 L brown stock to ~500 mL\"\n}\n\nANSWER FORMAT: integer_0_5",
      "expected": {
        "value": 5,
        "tolerance": 0
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "5",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 5.0,
            "expected": 5.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 378,
            "output_tokens": 4,
            "latency_ms": 779.3,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "5",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 5.0,
            "expected": 5.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 378,
            "output_tokens": 4,
            "latency_ms": 588.4,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "5",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 5.0,
            "expected": 5.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 378,
            "output_tokens": 4,
            "latency_ms": 591.4,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 378.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 653.033,
        "usd_sum": 6.3e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-013",
      "category": "composition",
      "capability": "cooking_method_recognition",
      "difficulty": "medium",
      "cuisine_region": "korean",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.cooking_methods"
      ],
      "prompt_template_hash": "f8257062be99b5d3",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n---\n\nReturn the single OGS cooking method that best fits the main cooking step for bossam pork belly (~90 min at low simmer in aromatic water).\n\nINPUT:\n{\n  \"description\": \"pork belly gently simmered for 90 minutes in water with ginger, doenjang, and pepper until fork-tender\"\n}\n\nANSWER FORMAT: one_of:raw,seared,grilled,roasted,braised,poached,steamed,fried,confit,smoked,fermented,cured,pickled,dehydrated,sous_vide,blanched,sauteed,baked,charred,emulsified",
      "expected": {
        "value": "poached"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "braised",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "braised",
            "expected": "poached",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 408,
            "output_tokens": 5,
            "latency_ms": 598.7,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "braised",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "braised",
            "expected": "poached",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 408,
            "output_tokens": 5,
            "latency_ms": 632.2,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "braised",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "braised",
            "expected": "poached",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 408,
            "output_tokens": 5,
            "latency_ms": 522.2,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 408.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 584.367,
        "usd_sum": 6.6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-014",
      "category": "composition",
      "capability": "component_role_assignment",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.component_roles"
      ],
      "prompt_template_hash": "114b0796c10cf141",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nReturn the single OGS component role that best fits a sprinkle of gremolata (parsley, garlic, lemon zest) finishing an osso buco at service.\n\nINPUT:\n{\n  \"description\": \"gremolata finishing an osso buco milanese at service\"\n}\n\nANSWER FORMAT: one_of:main_protein,main_vegetable,starch,sauce,fat_component,acid_component,aromatic_garnish,seasoning,textural_element",
      "expected": {
        "value": "aromatic_garnish"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "aromatic_garnish",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "aromatic_garnish",
            "expected": "aromatic_garnish",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 354,
            "output_tokens": 8,
            "latency_ms": 594.7,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "aromatic_garnish",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "aromatic_garnish",
            "expected": "aromatic_garnish",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 354,
            "output_tokens": 8,
            "latency_ms": 725.0,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "aromatic_garnish",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "aromatic_garnish",
            "expected": "aromatic_garnish",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 354,
            "output_tokens": 8,
            "latency_ms": 655.0,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 354.0,
        "output_tokens_mean": 8.0,
        "latency_ms_mean": 658.233,
        "usd_sum": 6.3e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-015",
      "category": "composition",
      "capability": "component_set_identification",
      "difficulty": "hard",
      "cuisine_region": "thai",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.component_roles"
      ],
      "prompt_template_hash": "07976f0ee97244e5",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nList the OGS component roles that appear in a classic Pad Thai (rice noodles, shrimp, tofu, egg, tamarind-palm sugar-fish sauce, bean sprouts, crushed peanuts, chives). Return a JSON array of role names.\n\nINPUT:\n{\n  \"dish\": \"classic Pad Thai\"\n}\n\nANSWER FORMAT: set_of_roles",
      "expected": {
        "values": [
          "starch",
          "main_protein",
          "acid_component",
          "seasoning",
          "aromatic_garnish",
          "textural_element",
          "fat_component"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"main_protein\",\"main_vegetable\",\"starch\",\"sauce\",\"fat_component\",\"acid_component\",\"aromatic_garnish\",\"seasoning\",\"textural_element\"]",
          "compliant": true,
          "score": 0.875,
          "detail": {
            "predicted": [
              "acid_component",
              "aromatic_garnish",
              "fat_component",
              "main_protein",
              "main_vegetable",
              "sauce",
              "seasoning",
              "starch",
              "textural_element"
            ],
            "expected": [
              "acid_component",
              "aromatic_garnish",
              "fat_component",
              "main_protein",
              "seasoning",
              "starch",
              "textural_element"
            ],
            "precision": 0.7778,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 328,
            "output_tokens": 38,
            "latency_ms": 755.9,
            "usd": 3.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"main_protein\",\"main_vegetable\",\"starch\",\"sauce\",\"fat_component\",\"acid_component\",\"aromatic_garnish\",\"seasoning\",\"textural_element\"]",
          "compliant": true,
          "score": 0.875,
          "detail": {
            "predicted": [
              "acid_component",
              "aromatic_garnish",
              "fat_component",
              "main_protein",
              "main_vegetable",
              "sauce",
              "seasoning",
              "starch",
              "textural_element"
            ],
            "expected": [
              "acid_component",
              "aromatic_garnish",
              "fat_component",
              "main_protein",
              "seasoning",
              "starch",
              "textural_element"
            ],
            "precision": 0.7778,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 328,
            "output_tokens": 38,
            "latency_ms": 794.9,
            "usd": 3.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"main_protein\",\"starch\",\"sauce\",\"main_vegetable\",\"fat_component\",\"acid_component\",\"aromatic_garnish\",\"seasoning\",\"textural_element\"]",
          "compliant": true,
          "score": 0.875,
          "detail": {
            "predicted": [
              "acid_component",
              "aromatic_garnish",
              "fat_component",
              "main_protein",
              "main_vegetable",
              "sauce",
              "seasoning",
              "starch",
              "textural_element"
            ],
            "expected": [
              "acid_component",
              "aromatic_garnish",
              "fat_component",
              "main_protein",
              "seasoning",
              "starch",
              "textural_element"
            ],
            "precision": 0.7778,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 328,
            "output_tokens": 38,
            "latency_ms": 913.3,
            "usd": 3.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.875,
      "score_summary": {
        "mean": 0.875,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.875,
        "ci95_hi": 0.875,
        "n": 3
      },
      "score_if_valid": 0.875,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 328.0,
        "output_tokens_mean": 38.0,
        "latency_ms_mean": 821.367,
        "usd_sum": 9.6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-016",
      "category": "composition",
      "capability": "service_temperature",
      "difficulty": "easy",
      "cuisine_region": "spanish",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.service_temperatures"
      ],
      "prompt_template_hash": "609a0643399a8df0",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Service temperatures\n\nfrozen, cold, cool, room, warm, hot, very_hot\n\n---\n\nReturn the single OGS service_temperature value for a Spanish ajo blanco (chilled almond-garlic soup).\n\nINPUT:\n{\n  \"dish\": \"ajo blanco, chilled Andalusian almond and garlic soup\"\n}\n\nANSWER FORMAT: one_of:frozen,cold,cool,room,warm,hot,very_hot",
      "expected": {
        "value": "cold"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "cold",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cold",
            "expected": "cold",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 308,
            "output_tokens": 4,
            "latency_ms": 598.3,
            "usd": 1.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "cold",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cold",
            "expected": "cold",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 308,
            "output_tokens": 4,
            "latency_ms": 542.9,
            "usd": 1.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "cold",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cold",
            "expected": "cold",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 308,
            "output_tokens": 4,
            "latency_ms": 594.0,
            "usd": 1.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 308.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 578.4,
        "usd_sum": 5.1e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-017",
      "category": "composition",
      "capability": "cooking_method_recognition",
      "difficulty": "medium",
      "cuisine_region": "chinese",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.cooking_methods"
      ],
      "prompt_template_hash": "09a6e47eda264ded",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n---\n\nReturn the single OGS cooking method that best describes the signature crisp skin of Peking duck (air-dried then hung-roasted in a wood-fired oven).\n\nINPUT:\n{\n  \"description\": \"Peking duck hung-roasted in a jujube-wood oven at ~230°C until the skin is lacquered and crisp\"\n}\n\nANSWER FORMAT: one_of:raw,seared,grilled,roasted,braised,poached,steamed,fried,confit,smoked,fermented,cured,pickled,dehydrated,sous_vide,blanched,sauteed,baked,charred,emulsified",
      "expected": {
        "value": "roasted"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "roasted",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "roasted",
            "expected": "roasted",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 413,
            "output_tokens": 5,
            "latency_ms": 618.8,
            "usd": 2.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "roasted",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "roasted",
            "expected": "roasted",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 413,
            "output_tokens": 5,
            "latency_ms": 593.9,
            "usd": 2.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "roasted",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "roasted",
            "expected": "roasted",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 413,
            "output_tokens": 5,
            "latency_ms": 639.0,
            "usd": 2.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 413.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 617.233,
        "usd_sum": 6.9e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-018",
      "category": "composition",
      "capability": "component_role_assignment",
      "difficulty": "medium",
      "cuisine_region": "levantine",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.component_roles"
      ],
      "prompt_template_hash": "32d4a921f4fbdd98",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nReturn the OGS component role that best fits tahini as an ingredient in a classic hummus recipe.\n\nINPUT:\n{\n  \"description\": \"25% tahini by weight blended into chickpea puree to make hummus\"\n}\n\nANSWER FORMAT: one_of:main_protein,main_vegetable,starch,sauce,fat_component,acid_component,aromatic_garnish,seasoning,textural_element",
      "expected": {
        "value": "fat_component"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "fat_component",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fat_component",
            "expected": "fat_component",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 342,
            "output_tokens": 5,
            "latency_ms": 627.1,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "fat_component",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fat_component",
            "expected": "fat_component",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 342,
            "output_tokens": 5,
            "latency_ms": 562.1,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "fat_component",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fat_component",
            "expected": "fat_component",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 342,
            "output_tokens": 5,
            "latency_ms": 406.6,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 342.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 531.933,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-019",
      "category": "composition",
      "capability": "service_temperature",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.service_temperatures"
      ],
      "prompt_template_hash": "eaa27377a264e2e2",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Service temperatures\n\nfrozen, cold, cool, room, warm, hot, very_hot\n\n---\n\nReturn the OGS service_temperature for a stirred-over-ice Negroni served in a rocks glass with a large cube.\n\nINPUT:\n{\n  \"dish\": \"stirred Negroni served over a single large ice cube\"\n}\n\nANSWER FORMAT: one_of:frozen,cold,cool,room,warm,hot,very_hot",
      "expected": {
        "value": "cold"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "cold",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cold",
            "expected": "cold",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 313,
            "output_tokens": 4,
            "latency_ms": 434.2,
            "usd": 1.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "cold",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cold",
            "expected": "cold",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 313,
            "output_tokens": 4,
            "latency_ms": 478.3,
            "usd": 1.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "cool",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "cool",
            "expected": "cold",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 313,
            "output_tokens": 4,
            "latency_ms": 623.8,
            "usd": 1.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.5774,
        "sem": 0.3333,
        "ci95_lo": 0.0133,
        "ci95_hi": 1.32,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 313.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 512.1,
        "usd_sum": 5.1e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-020",
      "category": "composition",
      "capability": "maillard_index",
      "difficulty": "hard",
      "cuisine_region": "american",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.maillard_index"
      ],
      "prompt_template_hash": "535bf59391539a15",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Maillard index (0-5 integer)\n\n- 0: no browning (raw, poached, steamed)\n- 1: pale gold (soft-baked dough, lightly sauteed)\n- 2: golden brown (roasted chicken skin)\n- 3: deep mahogany (well-seared steak crust, duck breast skin)\n- 4: very dark brown with bitter edge (heavy crust, blackened fish)\n- 5: fully charred (intentionally blackened, carbonised)\n\n---\n\nUsing the OGS Maillard index, return the integer for blackened redfish cooked in a smoking-hot cast iron skillet with Cajun spice rub.\n\nINPUT:\n{\n  \"description\": \"redfish fillet crusted with spice and cooked in a 300°C dry cast-iron pan until the surface is blackened and slightly bitter\"\n}\n\nANSWER FORMAT: integer_0_5",
      "expected": {
        "value": 4,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "4",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 4.0,
            "expected": 4.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 408,
            "output_tokens": 4,
            "latency_ms": 487.9,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "4",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 4.0,
            "expected": 4.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 408,
            "output_tokens": 4,
            "latency_ms": 603.9,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "4",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 4.0,
            "expected": 4.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 408,
            "output_tokens": 4,
            "latency_ms": 564.1,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 408.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 551.967,
        "usd_sum": 6.6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-001",
      "category": "dietary",
      "capability": "vegan_compatibility",
      "difficulty": "easy",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "18b5c9c7c958a521",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nIs a traditional Bolognese tagliatelle al ragù (tagliatelle with ragù made from beef, pancetta, soffritto, tomato, red wine, whole milk, and butter) compatible with a strict vegan diet? Answer with a single word: yes or no.\n\nINPUT:\n{\n  \"dish\": \"tagliatelle al ragù\",\n  \"diet\": \"vegan\"\n}\n\nANSWER FORMAT: exact:yes|no",
      "expected": {
        "value": "no"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "no",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "no",
            "expected": "no",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 910,
            "output_tokens": 4,
            "latency_ms": 539.2,
            "usd": 4.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "no",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "no",
            "expected": "no",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 910,
            "output_tokens": 4,
            "latency_ms": 543.0,
            "usd": 4.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "no",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "no",
            "expected": "no",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 910,
            "output_tokens": 4,
            "latency_ms": 587.5,
            "usd": 4.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 910.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 556.567,
        "usd_sum": 0.000141
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-002",
      "category": "dietary",
      "capability": "gluten_free_violation",
      "difficulty": "easy",
      "cuisine_region": "japanese",
      "scorer": "exact_match",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "04840e46f80ccb5b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nA kitchen claims this stir-fry is gluten-free: rice noodles, tofu, shiitake, broccoli, garlic, ginger, regular Japanese soy sauce, sesame oil, and scallions. Return the single ingredient that breaks the claim.\n\nINPUT:\n{\n  \"dish\": \"stir fry\",\n  \"ingredients\": [\n    \"rice noodles\",\n    \"tofu\",\n    \"shiitake\",\n    \"broccoli\",\n    \"garlic\",\n    \"ginger\",\n    \"regular Japanese soy sauce\",\n    \"sesame oil\",\n    \"scallions\"\n  ]\n}\n\nANSWER FORMAT: exact:string",
      "expected": {
        "value": "regular Japanese soy sauce",
        "aliases": [
          "soy sauce",
          "regular soy sauce"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "regular Japanese soy sauce",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "regular japanese soy sauce",
            "expected": "regular japanese soy sauce",
            "aliases": [
              "soy sauce",
              "regular soy sauce"
            ]
          },
          "metrics": {
            "input_tokens": 938,
            "output_tokens": 7,
            "latency_ms": 515.8,
            "usd": 5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "regular Japanese soy sauce",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "regular japanese soy sauce",
            "expected": "regular japanese soy sauce",
            "aliases": [
              "soy sauce",
              "regular soy sauce"
            ]
          },
          "metrics": {
            "input_tokens": 938,
            "output_tokens": 7,
            "latency_ms": 687.7,
            "usd": 5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "regular Japanese soy sauce",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "regular japanese soy sauce",
            "expected": "regular japanese soy sauce",
            "aliases": [
              "soy sauce",
              "regular soy sauce"
            ]
          },
          "metrics": {
            "input_tokens": 938,
            "output_tokens": 7,
            "latency_ms": 606.1,
            "usd": 5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 938.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 603.2,
        "usd_sum": 0.00015
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-003",
      "category": "dietary",
      "capability": "halal_compliance",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "013d6295fee472b5",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nIs a classic panna cotta (cream, sugar, vanilla, leaf gelatine) halal? Answer with yes or no.\n\nINPUT:\n{\n  \"dish\": \"panna cotta\",\n  \"ingredients\": [\n    \"cream\",\n    \"sugar\",\n    \"vanilla\",\n    \"leaf gelatine\"\n  ]\n}\n\nANSWER FORMAT: exact:yes|no",
      "expected": {
        "value": "no"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "no",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "no",
            "expected": "no",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 890,
            "output_tokens": 4,
            "latency_ms": 580.0,
            "usd": 4.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "yes",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "yes",
            "expected": "no",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 890,
            "output_tokens": 4,
            "latency_ms": 550.0,
            "usd": 4.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "no",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "no",
            "expected": "no",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 890,
            "output_tokens": 4,
            "latency_ms": 550.4,
            "usd": 4.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.5774,
        "sem": 0.3333,
        "ci95_lo": 0.0133,
        "ci95_hi": 1.32,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 890.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 560.133,
        "usd_sum": 0.000138
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-004",
      "category": "dietary",
      "capability": "kosher_meat_dairy",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "85fc94958c4357ef",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nChicken parmigiana is breaded chicken breast topped with tomato sauce and melted mozzarella, served with a side of cream-rich garlic mashed potatoes. From a kosher perspective, what is the core violation? Respond with one short phrase.\n\nINPUT:\n{\n  \"dish\": \"chicken parmigiana with creamy mash\"\n}\n\nANSWER FORMAT: exact:string",
      "expected": {
        "value": "mixing meat and dairy",
        "aliases": [
          "meat and dairy mixed",
          "basar b'chalav",
          "poultry with dairy",
          "meat with dairy"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Unkosher meat-dairy mixing (no separate meat and dairy)",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "unkosher meat-dairy mixing (no separate meat and dairy)",
            "expected": "mixing meat and dairy",
            "aliases": [
              "meat and dairy mixed",
              "basar b'chalav",
              "poultry with dairy",
              "meat with dairy"
            ]
          },
          "metrics": {
            "input_tokens": 884,
            "output_tokens": 17,
            "latency_ms": 1254.8,
            "usd": 5.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Mixing meat (chicken) with dairy (mozzarella/cream)",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "mixing meat (chicken) with dairy (mozzarella/cream)",
            "expected": "mixing meat and dairy",
            "aliases": [
              "meat and dairy mixed",
              "basar b'chalav",
              "poultry with dairy",
              "meat with dairy"
            ]
          },
          "metrics": {
            "input_tokens": 884,
            "output_tokens": 19,
            "latency_ms": 587.9,
            "usd": 5.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Serves meat with dairy (mozzarella/cream mash)",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "serves meat with dairy (mozzarella/cream mash)",
            "expected": "mixing meat and dairy",
            "aliases": [
              "meat and dairy mixed",
              "basar b'chalav",
              "poultry with dairy",
              "meat with dairy"
            ]
          },
          "metrics": {
            "input_tokens": 884,
            "output_tokens": 16,
            "latency_ms": 825.1,
            "usd": 5.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 884.0,
        "output_tokens_mean": 17.333,
        "latency_ms_mean": 889.267,
        "usd_sum": 0.000154
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-005",
      "category": "dietary",
      "capability": "allergen_detection",
      "difficulty": "easy",
      "cuisine_region": "italian",
      "scorer": "set_f1",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "f40f4c25a2ee4b71",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nFor a dish of pesto genovese made from basil, garlic, pine nuts, parmesan, pecorino, and olive oil, list the EU Annex II allergens that must be declared. Return a comma-separated list using the OGS allergen IDs.\n\nINPUT:\n{\n  \"dish\": \"pesto genovese\",\n  \"ingredients\": [\n    \"basil\",\n    \"garlic\",\n    \"pine nuts\",\n    \"parmesan\",\n    \"pecorino\",\n    \"olive oil\"\n  ]\n}\n\nANSWER FORMAT: set:comma-separated OGS allergen IDs",
      "expected": {
        "values": [
          "tree_nuts",
          "milk"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "tree_nuts,milk",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": [
              "milk",
              "tree_nuts"
            ],
            "expected": [
              "milk",
              "tree_nuts"
            ],
            "precision": 1.0,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 933,
            "output_tokens": 8,
            "latency_ms": 965.9,
            "usd": 5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "tree_nuts, milk",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": [
              "milk",
              "tree_nuts"
            ],
            "expected": [
              "milk",
              "tree_nuts"
            ],
            "precision": 1.0,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 933,
            "output_tokens": 8,
            "latency_ms": 873.2,
            "usd": 5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "tree_nuts,milk",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": [
              "milk",
              "tree_nuts"
            ],
            "expected": [
              "milk",
              "tree_nuts"
            ],
            "precision": 1.0,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 933,
            "output_tokens": 8,
            "latency_ms": 704.4,
            "usd": 5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 933.0,
        "output_tokens_mean": 8.0,
        "latency_ms_mean": 847.833,
        "usd_sum": 0.00015
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-006",
      "category": "dietary",
      "capability": "allergen_detection",
      "difficulty": "medium",
      "cuisine_region": "thai",
      "scorer": "set_f1",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "7f45e388301a68a5",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nReturn the EU Annex II allergens that must be declared for a standard pad thai (rice noodles, shrimp, egg, tamarind, fish sauce, palm sugar, roasted peanuts, bean sprouts, garlic chives, lime, chili).\n\nINPUT:\n{\n  \"dish\": \"pad thai\"\n}\n\nANSWER FORMAT: set:comma-separated OGS allergen IDs",
      "expected": {
        "values": [
          "crustaceans",
          "eggs",
          "fish",
          "peanuts"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "crustaceans,eggs,fish,peanuts,sulphur_dioxide",
          "compliant": true,
          "score": 0.8889,
          "detail": {
            "predicted": [
              "crustaceans",
              "eggs",
              "fish",
              "peanuts",
              "sulphur_dioxide"
            ],
            "expected": [
              "crustaceans",
              "eggs",
              "fish",
              "peanuts"
            ],
            "precision": 0.8,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 887,
            "output_tokens": 21,
            "latency_ms": 822.5,
            "usd": 5.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "crustaceans,eggs,fish,peanuts",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": [
              "crustaceans",
              "eggs",
              "fish",
              "peanuts"
            ],
            "expected": [
              "crustaceans",
              "eggs",
              "fish",
              "peanuts"
            ],
            "precision": 1.0,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 887,
            "output_tokens": 15,
            "latency_ms": 671.0,
            "usd": 5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "crustaceans,eggs,fish,peanuts",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": [
              "crustaceans",
              "eggs",
              "fish",
              "peanuts"
            ],
            "expected": [
              "crustaceans",
              "eggs",
              "fish",
              "peanuts"
            ],
            "precision": 1.0,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 887,
            "output_tokens": 15,
            "latency_ms": 656.2,
            "usd": 5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.963,
      "score_summary": {
        "mean": 0.963,
        "stdev": 0.0641,
        "sem": 0.037,
        "ci95_lo": 0.8904,
        "ci95_hi": 1.0356,
        "n": 3
      },
      "score_if_valid": 0.963,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 887.0,
        "output_tokens_mean": 17.0,
        "latency_ms_mean": 716.567,
        "usd_sum": 0.000153
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-007",
      "category": "dietary",
      "capability": "vegan_substitution",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "eb092a2c162960f6",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nName a single plant-based ingredient commonly used to emulsify and thicken a vegan carbonara sauce in place of eggs, while also delivering umami. Respond with one short ingredient name.\n\nINPUT:\n{\n  \"target_role\": \"egg emulsifier and umami in vegan carbonara\"\n}\n\nANSWER FORMAT: exact:string",
      "expected": {
        "value": "silken tofu",
        "aliases": [
          "silken tofu blended with nutritional yeast",
          "soft tofu"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "soy sauce",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "soy sauce",
            "expected": "silken tofu",
            "aliases": [
              "silken tofu blended with nutritional yeast",
              "soft tofu"
            ]
          },
          "metrics": {
            "input_tokens": 879,
            "output_tokens": 5,
            "latency_ms": 601.7,
            "usd": 4.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "tahini",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "tahini",
            "expected": "silken tofu",
            "aliases": [
              "silken tofu blended with nutritional yeast",
              "soft tofu"
            ]
          },
          "metrics": {
            "input_tokens": 879,
            "output_tokens": 6,
            "latency_ms": 609.2,
            "usd": 4.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "tahini",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "tahini",
            "expected": "silken tofu",
            "aliases": [
              "silken tofu blended with nutritional yeast",
              "soft tofu"
            ]
          },
          "metrics": {
            "input_tokens": 879,
            "output_tokens": 6,
            "latency_ms": 429.7,
            "usd": 4.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 879.0,
        "output_tokens_mean": 5.667,
        "latency_ms_mean": 546.867,
        "usd_sum": 0.000138
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-008",
      "category": "dietary",
      "capability": "gluten_free_substitution",
      "difficulty": "easy",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "480be99b3f35a51d",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nSuggest a single widely-used gluten-free flour to make a béchamel roux that maintains classic texture. Respond with one word.\n\nINPUT:\n{\n  \"target_role\": \"béchamel roux, gluten-free\"\n}\n\nANSWER FORMAT: exact:string",
      "expected": {
        "value": "rice flour",
        "aliases": [
          "cornstarch",
          "corn starch",
          "cornflour",
          "tapioca starch",
          "tapioca flour"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "rice",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "rice",
            "expected": "rice flour",
            "aliases": [
              "cornstarch",
              "corn starch",
              "cornflour",
              "tapioca starch",
              "tapioca flour"
            ]
          },
          "metrics": {
            "input_tokens": 866,
            "output_tokens": 4,
            "latency_ms": 537.5,
            "usd": 4.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "rice",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "rice",
            "expected": "rice flour",
            "aliases": [
              "cornstarch",
              "corn starch",
              "cornflour",
              "tapioca starch",
              "tapioca flour"
            ]
          },
          "metrics": {
            "input_tokens": 866,
            "output_tokens": 4,
            "latency_ms": 603.4,
            "usd": 4.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "rice",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "rice",
            "expected": "rice flour",
            "aliases": [
              "cornstarch",
              "corn starch",
              "cornflour",
              "tapioca starch",
              "tapioca flour"
            ]
          },
          "metrics": {
            "input_tokens": 866,
            "output_tokens": 4,
            "latency_ms": 679.8,
            "usd": 4.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 866.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 606.9,
        "usd_sum": 0.000135
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-009",
      "category": "dietary",
      "capability": "vegan_compatibility",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "set_f1",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "936752891554dba3",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nFrom this list, return every item that is NOT vegan as a comma-separated list: honey, agave nectar, wine clarified with isinglass, wine clarified with bentonite, carmine, nutritional yeast, shellac, maple syrup, worcestershire sauce (traditional), vegetable glycerin.\n\nINPUT:\n{\n  \"items\": [\n    \"honey\",\n    \"agave nectar\",\n    \"wine clarified with isinglass\",\n    \"wine clarified with bentonite\",\n    \"carmine\",\n    \"nutritional yeast\",\n    \"shellac\",\n    \"maple syrup\",\n    \"worcestershire sauce (traditional)\",\n    \"vegetable glycerin\"\n  ]\n}\n\nANSWER FORMAT: set:comma-separated item names",
      "expected": {
        "values": [
          "honey",
          "wine clarified with isinglass",
          "carmine",
          "shellac",
          "worcestershire sauce (traditional)"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "honey, wine clarified with isinglass, carmine, shellac",
          "compliant": true,
          "score": 0.8889,
          "detail": {
            "predicted": [
              "carmine",
              "honey",
              "shellac",
              "wine clarified with isinglass"
            ],
            "expected": [
              "carmine",
              "honey",
              "shellac",
              "wine clarified with isinglass",
              "worcestershire sauce (traditional)"
            ],
            "precision": 1.0,
            "recall": 0.8
          },
          "metrics": {
            "input_tokens": 963,
            "output_tokens": 18,
            "latency_ms": 3042.1,
            "usd": 5.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "honey, wine clarified with isinglass, carmine",
          "compliant": true,
          "score": 0.75,
          "detail": {
            "predicted": [
              "carmine",
              "honey",
              "wine clarified with isinglass"
            ],
            "expected": [
              "carmine",
              "honey",
              "shellac",
              "wine clarified with isinglass",
              "worcestershire sauce (traditional)"
            ],
            "precision": 1.0,
            "recall": 0.6
          },
          "metrics": {
            "input_tokens": 963,
            "output_tokens": 15,
            "latency_ms": 621.4,
            "usd": 5.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "honey, wine clarified with isinglass, carmine, shellac, worcestershire sauce (traditional), maple syrup, nutritional yeast",
          "compliant": true,
          "score": 0.8333,
          "detail": {
            "predicted": [
              "carmine",
              "honey",
              "maple syrup",
              "nutritional yeast",
              "shellac",
              "wine clarified with isinglass",
              "worcestershire sauce (traditional)"
            ],
            "expected": [
              "carmine",
              "honey",
              "shellac",
              "wine clarified with isinglass",
              "worcestershire sauce (traditional)"
            ],
            "precision": 0.7143,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 963,
            "output_tokens": 31,
            "latency_ms": 794.3,
            "usd": 6.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.8241,
      "score_summary": {
        "mean": 0.8241,
        "stdev": 0.0699,
        "sem": 0.0404,
        "ci95_lo": 0.745,
        "ci95_hi": 0.9032,
        "n": 3
      },
      "score_if_valid": 0.8241,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 963.0,
        "output_tokens_mean": 21.333,
        "latency_ms_mean": 1485.933,
        "usd_sum": 0.00017
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-010",
      "category": "dietary",
      "capability": "cross_contact_risk",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "set_f1",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "35c29846f76046d7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nA kitchen fries breaded calamari, battered cod, and french fries in a single deep fryer. Return the EU allergens that a customer ordering 'fries only' must be warned about as a comma-separated OGS allergen ID list.\n\nINPUT:\n{\n  \"scenario\": \"single shared fryer\",\n  \"items\": [\n    \"breaded calamari\",\n    \"battered cod\",\n    \"french fries\"\n  ]\n}\n\nANSWER FORMAT: set:comma-separated OGS allergen IDs",
      "expected": {
        "values": [
          "molluscs",
          "fish",
          "cereals_containing_gluten"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "cereals_containing_gluten,eggs,molluscs,fish",
          "compliant": true,
          "score": 0.8571,
          "detail": {
            "predicted": [
              "cereals_containing_gluten",
              "eggs",
              "fish",
              "molluscs"
            ],
            "expected": [
              "cereals_containing_gluten",
              "fish",
              "molluscs"
            ],
            "precision": 0.75,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 915,
            "output_tokens": 19,
            "latency_ms": 671.3,
            "usd": 5.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "fish,molluscs,cereals_containing_gluten,eggs,sulphur_dioxide",
          "compliant": true,
          "score": 0.75,
          "detail": {
            "predicted": [
              "cereals_containing_gluten",
              "eggs",
              "fish",
              "molluscs",
              "sulphur_dioxide"
            ],
            "expected": [
              "cereals_containing_gluten",
              "fish",
              "molluscs"
            ],
            "precision": 0.6,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 915,
            "output_tokens": 24,
            "latency_ms": 597.2,
            "usd": 5.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "fish,egg,crustaceans,molluscs",
          "compliant": true,
          "score": 0.5714,
          "detail": {
            "predicted": [
              "crustaceans",
              "egg",
              "fish",
              "molluscs"
            ],
            "expected": [
              "cereals_containing_gluten",
              "fish",
              "molluscs"
            ],
            "precision": 0.5,
            "recall": 0.6667
          },
          "metrics": {
            "input_tokens": 915,
            "output_tokens": 15,
            "latency_ms": 484.4,
            "usd": 5.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.7262,
      "score_summary": {
        "mean": 0.7262,
        "stdev": 0.1443,
        "sem": 0.0833,
        "ci95_lo": 0.5628,
        "ci95_hi": 0.8895,
        "n": 3
      },
      "score_if_valid": 0.7262,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 915.0,
        "output_tokens_mean": 19.333,
        "latency_ms_mean": 584.3,
        "usd_sum": 0.00016
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-011",
      "category": "dietary",
      "capability": "low_fodmap",
      "difficulty": "hard",
      "cuisine_region": "global",
      "scorer": "set_f1",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "bdd7970997c11172",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nReturn the low-FODMAP violations in this dish as a comma-separated list of ingredient names: garlic-infused olive oil, white onion, lactose-free milk, wheat pasta, lean chicken, fresh chives, honey, aged parmesan.\n\nINPUT:\n{\n  \"ingredients\": [\n    \"garlic-infused olive oil\",\n    \"white onion\",\n    \"lactose-free milk\",\n    \"wheat pasta\",\n    \"lean chicken\",\n    \"fresh chives\",\n    \"honey\",\n    \"aged parmesan\"\n  ]\n}\n\nANSWER FORMAT: set:comma-separated ingredient names",
      "expected": {
        "values": [
          "white onion",
          "wheat pasta",
          "honey"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "garlic-infused olive oil, white onion, honey, wheat pasta",
          "compliant": true,
          "score": 0.8571,
          "detail": {
            "predicted": [
              "garlic-infused olive oil",
              "honey",
              "wheat pasta",
              "white onion"
            ],
            "expected": [
              "honey",
              "wheat pasta",
              "white onion"
            ],
            "precision": 0.75,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 932,
            "output_tokens": 17,
            "latency_ms": 678.0,
            "usd": 5.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "garlic-infused olive oil, white onion, wheat pasta, honey",
          "compliant": true,
          "score": 0.8571,
          "detail": {
            "predicted": [
              "garlic-infused olive oil",
              "honey",
              "wheat pasta",
              "white onion"
            ],
            "expected": [
              "honey",
              "wheat pasta",
              "white onion"
            ],
            "precision": 0.75,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 932,
            "output_tokens": 17,
            "latency_ms": 921.1,
            "usd": 5.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "garlic-infused olive oil, white onion, wheat pasta, honey, aged parmesan",
          "compliant": true,
          "score": 0.75,
          "detail": {
            "predicted": [
              "aged parmesan",
              "garlic-infused olive oil",
              "honey",
              "wheat pasta",
              "white onion"
            ],
            "expected": [
              "honey",
              "wheat pasta",
              "white onion"
            ],
            "precision": 0.6,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 932,
            "output_tokens": 20,
            "latency_ms": 557.9,
            "usd": 5.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.8214,
      "score_summary": {
        "mean": 0.8214,
        "stdev": 0.0618,
        "sem": 0.0357,
        "ci95_lo": 0.7514,
        "ci95_hi": 0.8914,
        "n": 3
      },
      "score_if_valid": 0.8214,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 932.0,
        "output_tokens_mean": 18.0,
        "latency_ms_mean": 719.0,
        "usd_sum": 0.000161
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-012",
      "category": "dietary",
      "capability": "pescatarian_vs_vegetarian",
      "difficulty": "easy",
      "cuisine_region": "japanese",
      "scorer": "set_f1",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "42c897d741064981",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nA dish contains miso, dashi (made with katsuobushi), tofu, and wakame. Mark which labels it qualifies for out of: vegan, vegetarian, pescatarian. Return a comma-separated list.\n\nINPUT:\n{\n  \"dish\": \"classic awase-dashi miso soup with tofu and wakame\"\n}\n\nANSWER FORMAT: set:comma-separated labels",
      "expected": {
        "values": [
          "pescatarian"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "vegetarian",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": [
              "vegetarian"
            ],
            "expected": [
              "pescatarian"
            ],
            "precision": 0.0,
            "recall": 0.0
          },
          "metrics": {
            "input_tokens": 892,
            "output_tokens": 5,
            "latency_ms": 589.8,
            "usd": 4.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "pescatarian,vegetarian",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "pescatarian",
              "vegetarian"
            ],
            "expected": [
              "pescatarian"
            ],
            "precision": 0.5,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 892,
            "output_tokens": 10,
            "latency_ms": 593.4,
            "usd": 4.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "pescatarian,vegetarian",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "pescatarian",
              "vegetarian"
            ],
            "expected": [
              "pescatarian"
            ],
            "precision": 0.5,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 892,
            "output_tokens": 10,
            "latency_ms": 516.9,
            "usd": 4.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.4445,
      "score_summary": {
        "mean": 0.4445,
        "stdev": 0.3849,
        "sem": 0.2222,
        "ci95_lo": 0.0089,
        "ci95_hi": 0.88,
        "n": 3
      },
      "score_if_valid": 0.4445,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 892.0,
        "output_tokens_mean": 8.333,
        "latency_ms_mean": 566.7,
        "usd_sum": 0.000145
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:menu-001",
      "category": "menu_engineering",
      "capability": "matrix_classification",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.menu_engineering",
        "vocab.courses",
        "vocab.component_roles"
      ],
      "prompt_template_hash": "138c09ed611a2d43",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Menu engineering reference\n\nClassical Boston Matrix quadrants (popularity vs. contribution margin, each split at the menu average):\n- **Star**: high popularity, high margin → feature prominently, do   not change.\n- **Plowhorse**: high popularity, low margin → redesign the dish to   cut food cost or reprice gently.\n- **Puzzle**: low popularity, high margin → reposition on the menu,   rename, or add a compelling description.\n- **Dog**: low popularity, low margin → remove or replace.\n\nContribution margin (CM) = menu price − food cost. Food-cost percentage = food cost / menu price. Industry target for casual dining: 28–35% food cost.\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nA dish has above-average popularity and above-average contribution margin on the menu. Using the OGS menu-engineering matrix, which quadrant is it? Respond with one word.\n\nINPUT:\n{\n  \"popularity\": \"above\",\n  \"margin\": \"above\"\n}\n\nANSWER FORMAT: exact:star|plowhorse|puzzle|dog",
      "expected": {
        "value": "star"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "star",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "star",
            "expected": "star",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 498,
            "output_tokens": 4,
            "latency_ms": 647.4,
            "usd": 2.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "star",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "star",
            "expected": "star",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 498,
            "output_tokens": 4,
            "latency_ms": 583.3,
            "usd": 2.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "star",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "star",
            "expected": "star",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 498,
            "output_tokens": 4,
            "latency_ms": 638.2,
            "usd": 2.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 498.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 622.967,
        "usd_sum": 8.1e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:menu-002",
      "category": "menu_engineering",
      "capability": "matrix_classification",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.menu_engineering",
        "vocab.courses",
        "vocab.component_roles"
      ],
      "prompt_template_hash": "b5df321fd829de76",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Menu engineering reference\n\nClassical Boston Matrix quadrants (popularity vs. contribution margin, each split at the menu average):\n- **Star**: high popularity, high margin → feature prominently, do   not change.\n- **Plowhorse**: high popularity, low margin → redesign the dish to   cut food cost or reprice gently.\n- **Puzzle**: low popularity, high margin → reposition on the menu,   rename, or add a compelling description.\n- **Dog**: low popularity, low margin → remove or replace.\n\nContribution margin (CM) = menu price − food cost. Food-cost percentage = food cost / menu price. Industry target for casual dining: 28–35% food cost.\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nA dish sells strongly but the contribution margin is below the menu average. Classify it.\n\nINPUT:\n{\n  \"popularity\": \"above\",\n  \"margin\": \"below\"\n}\n\nANSWER FORMAT: exact:star|plowhorse|puzzle|dog",
      "expected": {
        "value": "plowhorse"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "plowhorse",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "plowhorse",
            "expected": "plowhorse",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 482,
            "output_tokens": 6,
            "latency_ms": 611.8,
            "usd": 2.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "plowhorse",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "plowhorse",
            "expected": "plowhorse",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 482,
            "output_tokens": 6,
            "latency_ms": 575.3,
            "usd": 2.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "plowhorse",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "plowhorse",
            "expected": "plowhorse",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 482,
            "output_tokens": 6,
            "latency_ms": 523.5,
            "usd": 2.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 482.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 570.2,
        "usd_sum": 8.1e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:menu-003",
      "category": "menu_engineering",
      "capability": "matrix_classification",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.menu_engineering",
        "vocab.courses",
        "vocab.component_roles"
      ],
      "prompt_template_hash": "9795b72bdda437f1",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Menu engineering reference\n\nClassical Boston Matrix quadrants (popularity vs. contribution margin, each split at the menu average):\n- **Star**: high popularity, high margin → feature prominently, do   not change.\n- **Plowhorse**: high popularity, low margin → redesign the dish to   cut food cost or reprice gently.\n- **Puzzle**: low popularity, high margin → reposition on the menu,   rename, or add a compelling description.\n- **Dog**: low popularity, low margin → remove or replace.\n\nContribution margin (CM) = menu price − food cost. Food-cost percentage = food cost / menu price. Industry target for casual dining: 28–35% food cost.\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nA dish has strong contribution margin but is rarely ordered. Classify it.\n\nINPUT:\n{\n  \"popularity\": \"below\",\n  \"margin\": \"above\"\n}\n\nANSWER FORMAT: exact:star|plowhorse|puzzle|dog",
      "expected": {
        "value": "puzzle"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "puzzle",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "puzzle",
            "expected": "puzzle",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 479,
            "output_tokens": 5,
            "latency_ms": 721.8,
            "usd": 2.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "puzzle",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "puzzle",
            "expected": "puzzle",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 479,
            "output_tokens": 5,
            "latency_ms": 423.7,
            "usd": 2.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "puzzle",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "puzzle",
            "expected": "puzzle",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 479,
            "output_tokens": 5,
            "latency_ms": 541.0,
            "usd": 2.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 479.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 562.167,
        "usd_sum": 7.8e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:menu-004",
      "category": "menu_engineering",
      "capability": "matrix_classification",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.menu_engineering",
        "vocab.courses",
        "vocab.component_roles"
      ],
      "prompt_template_hash": "fc0da6dcbbc842fb",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Menu engineering reference\n\nClassical Boston Matrix quadrants (popularity vs. contribution margin, each split at the menu average):\n- **Star**: high popularity, high margin → feature prominently, do   not change.\n- **Plowhorse**: high popularity, low margin → redesign the dish to   cut food cost or reprice gently.\n- **Puzzle**: low popularity, high margin → reposition on the menu,   rename, or add a compelling description.\n- **Dog**: low popularity, low margin → remove or replace.\n\nContribution margin (CM) = menu price − food cost. Food-cost percentage = food cost / menu price. Industry target for casual dining: 28–35% food cost.\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nA dish is rarely ordered and has a contribution margin below the menu average. Classify it.\n\nINPUT:\n{\n  \"popularity\": \"below\",\n  \"margin\": \"below\"\n}\n\nANSWER FORMAT: exact:star|plowhorse|puzzle|dog",
      "expected": {
        "value": "dog"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "dog",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "dog",
            "expected": "dog",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 483,
            "output_tokens": 4,
            "latency_ms": 508.9,
            "usd": 2.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "dog",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "dog",
            "expected": "dog",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 483,
            "output_tokens": 4,
            "latency_ms": 535.5,
            "usd": 2.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "dog",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "dog",
            "expected": "dog",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 483,
            "output_tokens": 4,
            "latency_ms": 628.0,
            "usd": 2.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 483.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 557.467,
        "usd_sum": 7.8e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:menu-005",
      "category": "menu_engineering",
      "capability": "food_cost",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.menu_engineering",
        "vocab.courses",
        "vocab.component_roles"
      ],
      "prompt_template_hash": "f33710f88f89121e",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Menu engineering reference\n\nClassical Boston Matrix quadrants (popularity vs. contribution margin, each split at the menu average):\n- **Star**: high popularity, high margin → feature prominently, do   not change.\n- **Plowhorse**: high popularity, low margin → redesign the dish to   cut food cost or reprice gently.\n- **Puzzle**: low popularity, high margin → reposition on the menu,   rename, or add a compelling description.\n- **Dog**: low popularity, low margin → remove or replace.\n\nContribution margin (CM) = menu price − food cost. Food-cost percentage = food cost / menu price. Industry target for casual dining: 28–35% food cost.\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nA dish has a food cost of £4.20 and a menu price of £14.00. Return the food-cost percentage as a single integer.\n\nINPUT:\n{\n  \"food_cost\": 4.2,\n  \"menu_price\": 14.0\n}\n\nANSWER FORMAT: numeric:integer percent",
      "expected": {
        "value": 30,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "30",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 30.0,
            "expected": 30.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 492,
            "output_tokens": 4,
            "latency_ms": 594.4,
            "usd": 2.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "30",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 30.0,
            "expected": 30.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 492,
            "output_tokens": 4,
            "latency_ms": 503.9,
            "usd": 2.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "30",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 30.0,
            "expected": 30.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 492,
            "output_tokens": 4,
            "latency_ms": 514.6,
            "usd": 2.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 492.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 537.633,
        "usd_sum": 7.8e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:menu-006",
      "category": "menu_engineering",
      "capability": "contribution_margin",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.menu_engineering",
        "vocab.courses",
        "vocab.component_roles"
      ],
      "prompt_template_hash": "7bb019ca9ca4b1be",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Menu engineering reference\n\nClassical Boston Matrix quadrants (popularity vs. contribution margin, each split at the menu average):\n- **Star**: high popularity, high margin → feature prominently, do   not change.\n- **Plowhorse**: high popularity, low margin → redesign the dish to   cut food cost or reprice gently.\n- **Puzzle**: low popularity, high margin → reposition on the menu,   rename, or add a compelling description.\n- **Dog**: low popularity, low margin → remove or replace.\n\nContribution margin (CM) = menu price − food cost. Food-cost percentage = food cost / menu price. Industry target for casual dining: 28–35% food cost.\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nA dish has a food cost of $6.50 and a menu price of $22.00. Return the contribution margin in dollars as a single number with two decimals.\n\nINPUT:\n{\n  \"food_cost\": 6.5,\n  \"menu_price\": 22.0\n}\n\nANSWER FORMAT: numeric:decimal USD",
      "expected": {
        "value": 15.5,
        "tolerance": 0.05
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "15.50",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 15.5,
            "expected": 15.5,
            "tolerance": 0.05
          },
          "metrics": {
            "input_tokens": 496,
            "output_tokens": 6,
            "latency_ms": 570.2,
            "usd": 2.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "15.50",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 15.5,
            "expected": 15.5,
            "tolerance": 0.05
          },
          "metrics": {
            "input_tokens": 496,
            "output_tokens": 6,
            "latency_ms": 608.1,
            "usd": 2.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "15.50",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 15.5,
            "expected": 15.5,
            "tolerance": 0.05
          },
          "metrics": {
            "input_tokens": 496,
            "output_tokens": 6,
            "latency_ms": 547.3,
            "usd": 2.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 496.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 575.2,
        "usd_sum": 8.1e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:menu-007",
      "category": "menu_engineering",
      "capability": "menu_rebalance",
      "difficulty": "hard",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.menu_engineering",
        "vocab.courses",
        "vocab.component_roles"
      ],
      "prompt_template_hash": "68b7fb2ec1e432f4",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Menu engineering reference\n\nClassical Boston Matrix quadrants (popularity vs. contribution margin, each split at the menu average):\n- **Star**: high popularity, high margin → feature prominently, do   not change.\n- **Plowhorse**: high popularity, low margin → redesign the dish to   cut food cost or reprice gently.\n- **Puzzle**: low popularity, high margin → reposition on the menu,   rename, or add a compelling description.\n- **Dog**: low popularity, low margin → remove or replace.\n\nContribution margin (CM) = menu price − food cost. Food-cost percentage = food cost / menu price. Industry target for casual dining: 28–35% food cost.\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nFive main courses have this (popularity %, margin $) profile: A (35, 18), B (30, 9), C (5, 6), D (20, 16), E (10, 7). Average popularity is 20%; average margin is $11.20. Which single dish is the dog? Respond with the letter only.\n\nINPUT:\n{\n  \"dishes\": [\n    {\n      \"id\": \"A\",\n      \"pop\": 35,\n      \"margin\": 18\n    },\n    {\n      \"id\": \"B\",\n      \"pop\": 30,\n      \"margin\": 9\n    },\n    {\n      \"id\": \"C\",\n      \"pop\": 5,\n      \"margin\": 6\n    },\n    {\n      \"id\": \"D\",\n      \"pop\": 20,\n      \"margin\": 16\n    },\n    {\n      \"id\": \"E\",\n      \"pop\": 10,\n      \"margin\": 7\n    }\n  ],\n  \"avg_pop\": 20,\n  \"avg_margin\": 11.2\n}\n\nANSWER FORMAT: exact:A|B|C|D|E",
      "expected": {
        "value": "C"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "B",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "b",
            "expected": "c",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 674,
            "output_tokens": 4,
            "latency_ms": 539.0,
            "usd": 3.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "B",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "b",
            "expected": "c",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 674,
            "output_tokens": 4,
            "latency_ms": 539.9,
            "usd": 3.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "B",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "b",
            "expected": "c",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 674,
            "output_tokens": 4,
            "latency_ms": 536.2,
            "usd": 3.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 674.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 538.367,
        "usd_sum": 0.000105
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:menu-008",
      "category": "menu_engineering",
      "capability": "target_price",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.menu_engineering",
        "vocab.courses",
        "vocab.component_roles"
      ],
      "prompt_template_hash": "dc5bf7ed3a0583c2",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Menu engineering reference\n\nClassical Boston Matrix quadrants (popularity vs. contribution margin, each split at the menu average):\n- **Star**: high popularity, high margin → feature prominently, do   not change.\n- **Plowhorse**: high popularity, low margin → redesign the dish to   cut food cost or reprice gently.\n- **Puzzle**: low popularity, high margin → reposition on the menu,   rename, or add a compelling description.\n- **Dog**: low popularity, low margin → remove or replace.\n\nContribution margin (CM) = menu price − food cost. Food-cost percentage = food cost / menu price. Industry target for casual dining: 28–35% food cost.\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nA chef wants a 28% food cost on a dish whose ingredient cost is £3.92. Return the target menu price in pounds, two decimals.\n\nINPUT:\n{\n  \"food_cost\": 3.92,\n  \"target_food_cost_pct\": 28\n}\n\nANSWER FORMAT: numeric:decimal GBP",
      "expected": {
        "value": 14.0,
        "tolerance": 0.05
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "£14.00",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 14.0,
            "expected": 14.0,
            "tolerance": 0.05
          },
          "metrics": {
            "input_tokens": 493,
            "output_tokens": 7,
            "latency_ms": 567.7,
            "usd": 2.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "£14.00",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 14.0,
            "expected": 14.0,
            "tolerance": 0.05
          },
          "metrics": {
            "input_tokens": 493,
            "output_tokens": 7,
            "latency_ms": 619.2,
            "usd": 2.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "14.00",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 14.0,
            "expected": 14.0,
            "tolerance": 0.05
          },
          "metrics": {
            "input_tokens": 493,
            "output_tokens": 6,
            "latency_ms": 15276.2,
            "usd": 2.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 493.0,
        "output_tokens_mean": 6.667,
        "latency_ms_mean": 5487.7,
        "usd_sum": 8.1e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:menu-009",
      "category": "menu_engineering",
      "capability": "menu_critique",
      "difficulty": "hard",
      "cuisine_region": "global",
      "scorer": "rubric_judge",
      "context_needs": [
        "rules.menu_engineering",
        "vocab.courses",
        "vocab.component_roles",
        "vocab.match_types",
        "rules.dietary",
        "rules.allergens_eu"
      ],
      "prompt_template_hash": "8883cacb0aca5024",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Menu engineering reference\n\nClassical Boston Matrix quadrants (popularity vs. contribution margin, each split at the menu average):\n- **Star**: high popularity, high margin → feature prominently, do   not change.\n- **Plowhorse**: high popularity, low margin → redesign the dish to   cut food cost or reprice gently.\n- **Puzzle**: low popularity, high margin → reposition on the menu,   rename, or add a compelling description.\n- **Dog**: low popularity, low margin → remove or replace.\n\nContribution margin (CM) = menu price − food cost. Food-cost percentage = food cost / menu price. Industry target for casual dining: 28–35% food cost.\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n---\n\nCritique the following five-course autumn tasting menu. Identify at least two concrete weaknesses (flow, seasonality, allergen risk, cost-to-price ratio, or structural monotony), suggest one specific, constructive change, and justify each point in 3 short paragraphs. Avoid generic praise.\n\nINPUT:\n{\n  \"menu\": [\n    {\n      \"course\": \"amuse\",\n      \"dish\": \"pumpkin velouté, sage oil\"\n    },\n    {\n      \"course\": \"appetizer\",\n      \"dish\": \"butternut squash risotto, brown butter, amaretti crumble\"\n    },\n    {\n      \"course\": \"fish\",\n      \"dish\": \"pan-seared scallops, pumpkin purée, crispy pancetta\"\n    },\n    {\n      \"course\": \"main\",\n      \"dish\": \"roast pork loin, sweet potato purée, cider jus\"\n    },\n    {\n      \"course\": \"dessert\",\n      \"dish\": \"pumpkin cheesecake, spiced pecan brittle\"\n    }\n  ],\n  \"target_food_cost_pct\": 30,\n  \"dietary_constraints\": \"must offer a viable vegetarian and a viable nut-free path\"\n}\n\nANSWER FORMAT: free text, 3 short paragraphs",
      "expected": {
        "rubric": [
          {
            "id": "diagnosis",
            "weight": 0.4,
            "description": "Identifies concrete, defensible weaknesses (flavor monotony from repeated pumpkin/squash, nut allergen in amaretti and pecans, heavy starches in consecutive courses).",
            "anchors": {
              "0.0": "No specific weakness identified.",
              "0.5": "Identifies one concrete weakness but misses another major one.",
              "1.0": "Identifies at least two of: pumpkin/squash repetition across 4 of 5 courses; nut allergen across amaretti and pecans; consecutive purée/starch heaviness."
            }
          },
          {
            "id": "remediation",
            "weight": 0.3,
            "description": "Offers a specific, actionable, seasonally-plausible change that addresses the identified weakness.",
            "anchors": {
              "0.0": "No change proposed or change is vague.",
              "0.5": "Change is specific but does not address the main weakness.",
              "1.0": "Concrete substitute with named ingredient/technique (e.g. swap scallop course to cured mackerel with pickled apple)."
            }
          },
          {
            "id": "justification",
            "weight": 0.3,
            "description": "Justification is grounded in OGS concepts (match types, structural balance, service flow) rather than marketing language.",
            "anchors": {
              "0.0": "Pure adjective salad.",
              "0.5": "Some reasoning, but appeals to authority not mechanism.",
              "1.0": "Explicitly cites a mechanism: intensity alignment, contrast, allergen cross-contact, food-cost math."
            }
          }
        ],
        "reference_answer": "Key weaknesses: (1) pumpkin/squash appears in 4 of 5 courses, collapsing the menu's flavour arc into a single beta-carotene/sweet-starch note; (2) the amaretti crumble (bitter almond) and the pecan brittle together block every nut-allergic guest from the tasting; (3) velouté, risotto, two purées and a cheesecake stack four viscous/starchy textures in a row with no acid or raw element. A concrete fix is to replace the scallop course with cured mackerel over pickled Granny Smith apple and wasabi-lime gel: it re-introduces sea/fruit/acid, breaks the texture loop, removes one pork-cross-contact dish, and simultaneously opens a pescatarian-friendly option. Justification: on OGS terms this swap adds a `contrast` match (acid/fat), raises `acidity_perception` mid-meal to refresh the palate before the roast, and removes one of the two tree-nut exposure points."
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "First weakness: structural monotony and seasonality echo. You repeat “pumpkin” in three of five courses (amuse, fish styling, and dessert), and “squash/pumpkin” dominates the entire arc, which flattens the sensory progression. After the pumpkin velouté, the appetizer is also squash-forward with warm, dairy/cheesy texture (risotto), and the fish again anchors on pumpkin purée—so the menu shifts mainly by protein, not by contrast (temperature, acidity, crunch, or aromatic direction). This is autumn-themed, but it reads like the same base ingredient re-skinned rather than a curated seasonal ladder.\n\nSecond weakness: allergen risk and nut-free feasibility conflicts with the dessert and appetizer. Amaretti crumble and spiced pecan brittle are both nut-adjacent pain points: amaretti commonly contains almond and other ingredients that may not be nut-free-safe, while pecan brittle is explicitly tree-nut based. Given your constraint to offer a viable nut-free path, these two items either force a major substitution or risk cross-contact issues (shared fryers, brittle production surfaces, garnish handling). That’s not just “allergy paperwork”—it directly undermines the nut-free path viability unless you redesign at least one of these courses.\n\nSpecific constructive change: keep the autumn concept but swap the nut-containing element in the appetizer and dessert with a truly nut-free crunch/sweet finish, and use that substitution to reintroduce contrast. For example, replace “amaretti crumble” with a toasted oat–seed crumble (check for gluten risk separately, and use sesame seeds only if you can manage the allergen label implications), and replace “spiced pecan brittle” with a spiced pumpkin-seed brittle or roasted oat brittle. Justification: this removes the primary nut-free blocker in two places, reduces cross-contact exposure, and lets you change texture twice (soft risotto → toasted crumble; creamy cheesecake → harder brittle) without changing the rest of the seasonal palette.",
          "compliant": true,
          "score": 0.88,
          "detail": {
            "per_criterion": [
              {
                "id": "diagnosis",
                "weight": 0.4,
                "score": 1.0,
                "note": "It clearly identifies two major weaknesses from the rubric: repeated pumpkin/squash causing monotony and nut-allergen conflicts blocking a viable nut-free path."
              },
              {
                "id": "remediation",
                "weight": 0.3,
                "score": 0.7,
                "note": "It proposes concrete substitutions for amaretti and pecan brittle, but the change addresses allergen risk more than the broader flavor/flow monotony it diagnoses."
              },
              {
                "id": "justification",
                "weight": 0.3,
                "score": 0.9,
                "note": "The reasoning is mechanism-based, citing sensory progression, contrast, texture sequencing, and cross-contact risk rather than generic praise."
              }
            ],
            "summary": "The answer is in the requested three-paragraph free-text format and avoids generic praise. It gives a strong diagnosis and practical allergen-focused fixes, though its proposed change only partially resolves the menu's larger repetition and flow problem.",
            "judge_model": "gpt-5.4",
            "judge_prompt_hash": "756db6a487714bca",
            "judge_validated": false
          },
          "metrics": {
            "input_tokens": 1251,
            "output_tokens": 412,
            "latency_ms": 2774.6,
            "usd": 0.000227,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Your menu’s allergen and dietary pathing is the biggest risk. “Amaretti crumble” and “spiced pecan brittle” introduce nuts (pecans) and also typical gluten-containing cookie components (amaretti), directly colliding with the “viable nut-free path.” Meanwhile, the scallops dish adds a mollusc allergen risk (“molluscs” in EU terms) and pancetta adds pork (fine for omnivores but constrains vegetarian substitutions). As written, there’s no clear parallel nut-free/vegetarian build—so meeting both requirements likely means swapping multiple dishes rather than offering “a path” through the same service.\n\nFlow is also structurally weak because pumpkin dominates across four of five courses (“pumpkin velouté,” “butternut squash risotto,” “pumpkin purée” with scallops, and pumpkin cheesecake). That creates seasonality repetition rather than autumn progression: you’re tasting very similar sweet, starch-forward squash notes in starter, entrée, and dessert. Additionally, the palate may feel heavy because you stack richness drivers: brown butter in the risotto, pancetta with scallops, pork loin with cider jus, and cheesecake after already having multiple creamy/umami components. The fish-to-main transition is especially likely to be clashing rather than lifting—there’s no palate-reset element (acid/brightness) between the second and third courses.\n\nOne specific constructive change: replace the appetizer’s “amaretti crumble” with a nut-free, gluten-aware autumn topping and add a sharper, non-pumpkin brightness element to that course (e.g., a citrusy, herb-forward component). Concretely: keep “butternut squash risotto” but finish with a nut-free crumble that avoids “amaretti” (and avoids tree nuts like pecans), and add an acid accent (lemon/verjus foam or pickled quince strands) to sharpen the transition into the fish course. This single course adjustment improves the nut-free feasibility (removes the pecan/amaretti issue), breaks the pumpkin monotony by introducing a different flavor axis (tart/bright rather than only sweet-squash), and makes the whole progression feel more intentional without requiring a full menu rewrite.",
          "compliant": true,
          "score": 0.85,
          "detail": {
            "per_criterion": [
              {
                "id": "diagnosis",
                "weight": 0.4,
                "score": 1.0,
                "note": "It clearly identifies multiple concrete weaknesses: repeated pumpkin/squash across four courses, nut-allergen conflicts from amaretti and pecans, and a heavy rich/starchy flow with limited palate reset."
              },
              {
                "id": "remediation",
                "weight": 0.3,
                "score": 0.5,
                "note": "It proposes a specific adjustment to the risotto course, but the change is only partly concrete and overclaims by saying it removes the pecan issue without altering dessert."
              },
              {
                "id": "justification",
                "weight": 0.3,
                "score": 1.0,
                "note": "The reasoning is mechanism-based, citing dietary path viability, palate heaviness, lack of acid/brightness, and service-flow effects rather than generic praise."
              }
            ],
            "summary": "The answer follows the requested three-paragraph free-text format and gives a strong critique grounded in menu structure and allergen logic. Its main weakness is that the proposed fix is only partially sufficient, since changing the appetizer alone does not fully solve the nut-free path problem created by the pecan dessert.",
            "judge_model": "gpt-5.4",
            "judge_prompt_hash": "756db6a487714bca",
            "judge_validated": false
          },
          "metrics": {
            "input_tokens": 1251,
            "output_tokens": 458,
            "latency_ms": 3041.3,
            "usd": 0.000246,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "The menu has a structural monotony around pumpkin/squash and “autumn fat + sweet spice,” which makes the palate repeat rather than evolve. Velouté → risotto → scallops + purée → pork + purée → cheesecake all lean on warm, orange sweetness and creamy texture, so the progression risks becoming one-note (and you also lose a clean “reset” between savory courses and dessert). A concrete flow weakness: there’s no genuinely bright, cleansing element (acid-forward palate cleanser) after the rich appetizer and before the richer fish/pork touches, so diners may feel increasingly heavy.\n\nThere are also allergen and feasibility risks against the stated vegetarian and nut-free requirements. “Amaretti crumble” and “spiced pecan brittle” introduce nuts directly (pecan) and likely almond-derived amaretti, which conflicts with a nut-free path; additionally, risotto often contains Parmesan (milk) and brown butter is dairy. For vegetarian viability, the fish and pork courses make the overall tasting non-viable unless you provide substitutions; as written, there’s no explicit vegetarian track, and there’s no obvious mechanism to keep the menu “paired” in flavor/texture across courses.\n\nOne constructive change: keep the pumpkin theme but introduce a single acid-and-herb counterpoint course element that appears as a component across multiple courses (e.g., an apple-cider vinegar pickled shallot or apple + sorrel salad/gel) while replacing the nut components with nut-free alternatives. Concretely, swap “amaretti crumble” and “spiced pecan brittle” for a nut-free crunchy using toasted oat crumb or toasted seed brittle (e.g., pepitas where permitted by your allergy policy) and add an acid “refresh” element to at least the fish or main (before moving to dessert). This directly fixes two weaknesses at once: it improves palate flow through sharper acid contrast (less fatigue) and removes the nut allergen pressure so you can maintain a viable nut-free path without rewriting the entire menu.",
          "compliant": true,
          "score": 0.91,
          "detail": {
            "per_criterion": [
              {
                "id": "diagnosis",
                "weight": 0.4,
                "score": 1.0,
                "note": "It clearly identifies multiple concrete weaknesses: repeated pumpkin/squash flavor monotony, heavy puree/starch texture repetition, and nut-allergen conflicts with the stated nut-free requirement."
              },
              {
                "id": "remediation",
                "weight": 0.3,
                "score": 0.8,
                "note": "It proposes specific substitutions like toasted oat crumb or seed brittle and adding an apple-cider pickled acidic element, though the change is somewhat diffuse rather than a single sharply defined course swap."
              },
              {
                "id": "justification",
                "weight": 0.3,
                "score": 0.9,
                "note": "The reasoning is grounded in service flow, palate fatigue, acid contrast, and allergen feasibility, with explicit mechanisms rather than generic praise."
              }
            ],
            "summary": "The answer follows the requested three-paragraph free-text format and avoids generic praise. It diagnoses the menu well and offers actionable fixes with solid mechanism-based justification, though the remediation is broader and less singular than the strongest possible response.",
            "judge_model": "gpt-5.4",
            "judge_prompt_hash": "756db6a487714bca",
            "judge_validated": false
          },
          "metrics": {
            "input_tokens": 1251,
            "output_tokens": 414,
            "latency_ms": 2905.1,
            "usd": 0.000228,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.88,
      "score_summary": {
        "mean": 0.88,
        "stdev": 0.03,
        "sem": 0.0173,
        "ci95_lo": 0.8461,
        "ci95_hi": 0.9139,
        "n": 3
      },
      "score_if_valid": 0.88,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1251.0,
        "output_tokens_mean": 428.0,
        "latency_ms_mean": 2907.0,
        "usd_sum": 0.000701
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-ar-001",
      "category": "vocabulary",
      "capability": "ingredient_classification",
      "difficulty": "easy",
      "cuisine_region": "middle_eastern",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.ingredient_categories"
      ],
      "prompt_template_hash": "94e19d81d80368a1",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Ingredient categories\n\nprotein, vegetable, fruit, grain, dairy, fat, herb, spice, fungus, legume, nut, seed, sweetener, acid, salt, other\n\n---\n\nباستخدام مفردات OGS لتصنيفات المكونات، صنّف الزعتر بكلمة واحدة.\n\nINPUT:\n{\n  \"item\": \"زعتر\"\n}\n\nANSWER FORMAT: one_of:protein,vegetable,fruit,grain,dairy,fat,herb,spice,fungus,legume,nut,seed,sweetener,acid,salt,other",
      "expected": {
        "value": "herb",
        "aliases": [
          "spice"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "herb",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "herb",
            "expected": "herb",
            "aliases": [
              "spice"
            ]
          },
          "metrics": {
            "input_tokens": 342,
            "output_tokens": 5,
            "latency_ms": 591.0,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "herb",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "herb",
            "expected": "herb",
            "aliases": [
              "spice"
            ]
          },
          "metrics": {
            "input_tokens": 342,
            "output_tokens": 5,
            "latency_ms": 674.8,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "herb",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "herb",
            "expected": "herb",
            "aliases": [
              "spice"
            ]
          },
          "metrics": {
            "input_tokens": 342,
            "output_tokens": 5,
            "latency_ms": 509.6,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 342.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 591.8,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-ar-002",
      "category": "regional",
      "capability": "region_identification",
      "difficulty": "easy",
      "cuisine_region": "middle_eastern",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "6b39ec376be689ad",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nالمنسف هو الطبق الوطني لأي بلد عربي؟ أجب بكلمة واحدة.\n\nINPUT:\n{\n  \"dish\": \"المنسف\"\n}\n\nANSWER FORMAT: exact:country",
      "expected": {
        "value": "jordan",
        "aliases": [
          "الأردن",
          "jordan (الأردن)"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Jordan",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "jordan",
            "expected": "jordan",
            "aliases": [
              "الأردن",
              "jordan (الأردن)"
            ]
          },
          "metrics": {
            "input_tokens": 259,
            "output_tokens": 4,
            "latency_ms": 556.4,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "الأردن",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "الأردن",
            "expected": "jordan",
            "aliases": [
              "الأردن",
              "jordan (الأردن)"
            ]
          },
          "metrics": {
            "input_tokens": 259,
            "output_tokens": 6,
            "latency_ms": 515.4,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "الأردن",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "الأردن",
            "expected": "jordan",
            "aliases": [
              "الأردن",
              "jordan (الأردن)"
            ]
          },
          "metrics": {
            "input_tokens": 259,
            "output_tokens": 6,
            "latency_ms": 458.1,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 259.0,
        "output_tokens_mean": 5.333,
        "latency_ms_mean": 509.967,
        "usd_sum": 4.5e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-es-001",
      "category": "regional",
      "capability": "region_identification",
      "difficulty": "easy",
      "cuisine_region": "spanish",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "5fe5c7f1f85e3675",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nEl pulpo a feira es un plato tradicional de qué región de España? Responde con una palabra.\n\nINPUT:\n{\n  \"dish\": \"pulpo a feira\"\n}\n\nANSWER FORMAT: exact:region name",
      "expected": {
        "value": "galicia",
        "aliases": [
          "galícia",
          "galiza"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Galicia",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "galicia",
            "expected": "galicia",
            "aliases": [
              "galícia",
              "galiza"
            ]
          },
          "metrics": {
            "input_tokens": 263,
            "output_tokens": 5,
            "latency_ms": 630.8,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Galicia",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "galicia",
            "expected": "galicia",
            "aliases": [
              "galícia",
              "galiza"
            ]
          },
          "metrics": {
            "input_tokens": 263,
            "output_tokens": 5,
            "latency_ms": 640.9,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Galicia",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "galicia",
            "expected": "galicia",
            "aliases": [
              "galícia",
              "galiza"
            ]
          },
          "metrics": {
            "input_tokens": 263,
            "output_tokens": 5,
            "latency_ms": 749.3,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 263.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 673.667,
        "usd_sum": 4.5e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-es-002",
      "category": "vocabulary",
      "capability": "cooking_method_classification",
      "difficulty": "easy",
      "cuisine_region": "spanish",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.cooking_methods"
      ],
      "prompt_template_hash": "9f03b90711bec40f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n---\n\nUsando el vocabulario OGS de métodos de cocción, ¿cuál describe mejor la preparación clásica del cochinillo segoviano? Responde con una palabra.\n\nINPUT:\n{\n  \"item\": \"cochinillo segoviano\"\n}\n\nANSWER FORMAT: one_of:raw,seared,grilled,roasted,braised,poached,steamed,fried,confit,smoked,fermented,cured,pickled,dehydrated,sous_vide,blanched,sauteed,baked,charred,emulsified",
      "expected": {
        "value": "roasted"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "roasted",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "roasted",
            "expected": "roasted",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 396,
            "output_tokens": 5,
            "latency_ms": 568.6,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "roasted",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "roasted",
            "expected": "roasted",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 396,
            "output_tokens": 5,
            "latency_ms": 653.0,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "roasted",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "roasted",
            "expected": "roasted",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 396,
            "output_tokens": 5,
            "latency_ms": 587.6,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 396.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 603.067,
        "usd_sum": 6.6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-fr-001",
      "category": "regional",
      "capability": "region_identification",
      "difficulty": "easy",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "8d252ae3fb170e15",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nLa bouillabaisse est une préparation traditionnelle de quelle ville française ? Répondez par un mot.\n\nINPUT:\n{\n  \"dish\": \"bouillabaisse\"\n}\n\nANSWER FORMAT: exact:city name",
      "expected": {
        "value": "marseille"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Marseille",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "marseille",
            "expected": "marseille",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 263,
            "output_tokens": 5,
            "latency_ms": 610.2,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Marseille",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "marseille",
            "expected": "marseille",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 263,
            "output_tokens": 5,
            "latency_ms": 512.8,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Marseille",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "marseille",
            "expected": "marseille",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 263,
            "output_tokens": 5,
            "latency_ms": 563.9,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 263.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 562.3,
        "usd_sum": 4.5e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-fr-002",
      "category": "sensory",
      "capability": "basic_tastes_profile",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "numeric_vector_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "96a07aeab44536ab",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nPour une sauce beurre blanc classique, donnez le profil sur l'échelle OGS des cinq saveurs [sweet, sour, salty, bitter, umami] sous la forme d'un tableau JSON de cinq nombres entre 0 et 10.\n\nINPUT:\n{\n  \"dish\": \"sauce beurre blanc classique\"\n}\n\nANSWER FORMAT: json:array of 5 numbers",
      "expected": {
        "values": [
          1,
          5,
          4,
          1,
          3
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[1.0, 4.5, 3.5, 0.5, 2.0]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 1.0,
                "expected": 1,
                "score": 1.0
              },
              {
                "index": 1,
                "predicted": 4.5,
                "expected": 5,
                "score": 0.6667
              },
              {
                "index": 2,
                "predicted": 3.5,
                "expected": 4,
                "score": 0.6667
              },
              {
                "index": 3,
                "predicted": 0.5,
                "expected": 1,
                "score": 0.6667
              },
              {
                "index": 4,
                "predicted": 2.0,
                "expected": 3,
                "score": 0.3333
              }
            ],
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 372,
            "output_tokens": 28,
            "latency_ms": 690.4,
            "usd": 3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[2.5,6.5,3.0,0.5,2.0]",
          "compliant": true,
          "score": 0.2667,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 2.5,
                "expected": 1,
                "score": 0.0
              },
              {
                "index": 1,
                "predicted": 6.5,
                "expected": 5,
                "score": 0.0
              },
              {
                "index": 2,
                "predicted": 3.0,
                "expected": 4,
                "score": 0.3333
              },
              {
                "index": 3,
                "predicted": 0.5,
                "expected": 1,
                "score": 0.6667
              },
              {
                "index": 4,
                "predicted": 2.0,
                "expected": 3,
                "score": 0.3333
              }
            ],
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 372,
            "output_tokens": 24,
            "latency_ms": 618.8,
            "usd": 2.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[1.0,4.0,2.0,0.2,3.0]",
          "compliant": true,
          "score": 0.56,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 1.0,
                "expected": 1,
                "score": 1.0
              },
              {
                "index": 1,
                "predicted": 4.0,
                "expected": 5,
                "score": 0.3333
              },
              {
                "index": 2,
                "predicted": 2.0,
                "expected": 4,
                "score": 0.0
              },
              {
                "index": 3,
                "predicted": 0.2,
                "expected": 1,
                "score": 0.4667
              },
              {
                "index": 4,
                "predicted": 3.0,
                "expected": 3,
                "score": 1.0
              }
            ],
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 372,
            "output_tokens": 24,
            "latency_ms": 698.1,
            "usd": 2.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.4978,
      "score_summary": {
        "mean": 0.4978,
        "stdev": 0.2071,
        "sem": 0.1196,
        "ci95_lo": 0.2634,
        "ci95_hi": 0.7322,
        "n": 3
      },
      "score_if_valid": 0.4978,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 372.0,
        "output_tokens_mean": 25.333,
        "latency_ms_mean": 669.1,
        "usd_sum": 8.6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-it-001",
      "category": "regional",
      "capability": "region_identification",
      "difficulty": "easy",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "c3aae88ebfcbe08b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nI pici sono una pasta fresca tipica di quale regione italiana? Rispondi con una parola.\n\nINPUT:\n{\n  \"dish\": \"pici cacio e pepe\"\n}\n\nANSWER FORMAT: exact:regione",
      "expected": {
        "value": "toscana"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Toscana",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "toscana",
            "expected": "toscana",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 265,
            "output_tokens": 6,
            "latency_ms": 735.8,
            "usd": 1.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Toscana",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "toscana",
            "expected": "toscana",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 265,
            "output_tokens": 6,
            "latency_ms": 588.3,
            "usd": 1.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Toscana",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "toscana",
            "expected": "toscana",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 265,
            "output_tokens": 6,
            "latency_ms": 703.6,
            "usd": 1.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 265.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 675.9,
        "usd_sum": 4.8e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-it-002",
      "category": "pairing",
      "capability": "beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.beverage_types",
        "vocab.match_types"
      ],
      "prompt_template_hash": "30919f613b1fbcd6",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Beverage types\n\nwine, beer, spirit, cider, sake, other\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nScegli l'abbinamento più classico per l'ossobuco alla milanese fra: barolo, riesling, prosecco, lambrusco. Rispondi con una parola.\n\nINPUT:\n{\n  \"dish\": \"ossobuco alla milanese\"\n}\n\nANSWER FORMAT: exact:one of the given options",
      "expected": {
        "value": "barolo"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "barolo",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "barolo",
            "expected": "barolo",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 328,
            "output_tokens": 5,
            "latency_ms": 636.7,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "barolo",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "barolo",
            "expected": "barolo",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 328,
            "output_tokens": 5,
            "latency_ms": 609.8,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "barolo",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "barolo",
            "expected": "barolo",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 328,
            "output_tokens": 5,
            "latency_ms": 722.1,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 328.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 656.2,
        "usd_sum": 5.4e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-ja-001",
      "category": "vocabulary",
      "capability": "aromatic_family_classification",
      "difficulty": "easy",
      "cuisine_region": "japanese",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "ff1f43723314705c",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nOGSの香りファミリー一覧から、柚子（ゆず）の主たる香りファミリーを一語で答えてください。\n\nINPUT:\n{\n  \"item\": \"柚子\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "fruit"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "fruit",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fruit",
            "expected": "fruit",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 332,
            "output_tokens": 4,
            "latency_ms": 579.8,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "fruit",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fruit",
            "expected": "fruit",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 332,
            "output_tokens": 4,
            "latency_ms": 586.1,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "fruit",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fruit",
            "expected": "fruit",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 332,
            "output_tokens": 4,
            "latency_ms": 588.2,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 332.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 584.7,
        "usd_sum": 5.4e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-ja-002",
      "category": "pairing",
      "capability": "beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.beverage_types",
        "vocab.match_types"
      ],
      "prompt_template_hash": "f68d4882814e0ecf",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Beverage types\n\nwine, beer, spirit, cider, sake, other\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\n握り寿司（赤身・白身・光物）のコース全体に最も安全に合う飲料のタイプを、次から一つ選んでください: sake, red_wine, stout_beer, bourbon.\n\nINPUT:\n{\n  \"dish\": \"握り寿司コース\"\n}\n\nANSWER FORMAT: exact:one of the given options",
      "expected": {
        "value": "sake"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "sake",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "sake",
            "expected": "sake",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 338,
            "output_tokens": 5,
            "latency_ms": 591.3,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "sake",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "sake",
            "expected": "sake",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 338,
            "output_tokens": 5,
            "latency_ms": 683.3,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "sake",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "sake",
            "expected": "sake",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 338,
            "output_tokens": 5,
            "latency_ms": 749.3,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 338.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 674.633,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-001",
      "category": "nutrition",
      "capability": "energy_calculation",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "94fadc6f5c8c5968",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nA portion contains 40 g carbohydrate, 20 g protein, 15 g fat. Return total energy in kcal as a single integer, using the EU Annex XIV macro factors.\n\nINPUT:\n{\n  \"carb_g\": 40,\n  \"protein_g\": 20,\n  \"fat_g\": 15\n}\n\nANSWER FORMAT: numeric:integer kcal",
      "expected": {
        "value": 375,
        "tolerance": 5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "1180",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 1180.0,
            "expected": 375.0,
            "tolerance": 5.0
          },
          "metrics": {
            "input_tokens": 815,
            "output_tokens": 5,
            "latency_ms": 524.3,
            "usd": 4.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "567",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 567.0,
            "expected": 375.0,
            "tolerance": 5.0
          },
          "metrics": {
            "input_tokens": 815,
            "output_tokens": 4,
            "latency_ms": 540.2,
            "usd": 4.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "570",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 570.0,
            "expected": 375.0,
            "tolerance": 5.0
          },
          "metrics": {
            "input_tokens": 815,
            "output_tokens": 4,
            "latency_ms": 607.0,
            "usd": 4.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 815.0,
        "output_tokens_mean": 4.333,
        "latency_ms_mean": 557.167,
        "usd_sum": 0.000127
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-002",
      "category": "nutrition",
      "capability": "energy_calculation",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "4b141daaadd9af57",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nA 150 mL glass of red wine contains approximately 14 g alcohol, 4 g carbohydrate, no protein, no fat. Return energy in kcal as a single integer.\n\nINPUT:\n{\n  \"carb_g\": 4,\n  \"protein_g\": 0,\n  \"fat_g\": 0,\n  \"alcohol_g\": 14\n}\n\nANSWER FORMAT: numeric:integer kcal",
      "expected": {
        "value": 114,
        "tolerance": 5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "220",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 220.0,
            "expected": 114.0,
            "tolerance": 5.0
          },
          "metrics": {
            "input_tokens": 824,
            "output_tokens": 4,
            "latency_ms": 1083.9,
            "usd": 4.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "153",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 153.0,
            "expected": 114.0,
            "tolerance": 5.0
          },
          "metrics": {
            "input_tokens": 824,
            "output_tokens": 4,
            "latency_ms": 589.1,
            "usd": 4.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "1",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 1.0,
            "expected": 114.0,
            "tolerance": 5.0
          },
          "metrics": {
            "input_tokens": 824,
            "output_tokens": 4,
            "latency_ms": 588.0,
            "usd": 4.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 824.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 753.667,
        "usd_sum": 0.000129
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-003",
      "category": "nutrition",
      "capability": "per_100g_estimate",
      "difficulty": "medium",
      "cuisine_region": "mediterranean",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "518ee70ef096f9ec",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nA simple olive-oil vinaigrette is 75 g olive oil + 25 g red wine vinegar (vinegar ≈ 20 kcal/100 g). Return energy per 100 g of finished dressing as a single integer kcal.\n\nINPUT:\n{\n  \"olive_oil_g\": 75,\n  \"vinegar_g\": 25\n}\n\nANSWER FORMAT: numeric:integer kcal per 100 g",
      "expected": {
        "value": 668,
        "tolerance": 15
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "686",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 686.0,
            "expected": 668.0,
            "tolerance": 15.0
          },
          "metrics": {
            "input_tokens": 825,
            "output_tokens": 4,
            "latency_ms": 526.6,
            "usd": 4.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "558",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 558.0,
            "expected": 668.0,
            "tolerance": 15.0
          },
          "metrics": {
            "input_tokens": 825,
            "output_tokens": 4,
            "latency_ms": 507.7,
            "usd": 4.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "585",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 585.0,
            "expected": 668.0,
            "tolerance": 15.0
          },
          "metrics": {
            "input_tokens": 825,
            "output_tokens": 4,
            "latency_ms": 783.6,
            "usd": 4.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 825.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 605.967,
        "usd_sum": 0.000129
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-004",
      "category": "nutrition",
      "capability": "protein_estimate",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "42947503c82b308d",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nEstimate the protein (g) in a 180 g cooked portion of skinless chicken-breast filet assuming 30% cooking mass loss. Respond with one integer.\n\nINPUT:\n{\n  \"raw_mass_g_target_cooked\": 180,\n  \"cooking_loss\": 0.3\n}\n\nANSWER FORMAT: numeric:integer g",
      "expected": {
        "value": 59,
        "tolerance": 4
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "31",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 31.0,
            "expected": 59.0,
            "tolerance": 4.0
          },
          "metrics": {
            "input_tokens": 811,
            "output_tokens": 4,
            "latency_ms": 802.4,
            "usd": 4.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "44",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 44.0,
            "expected": 59.0,
            "tolerance": 4.0
          },
          "metrics": {
            "input_tokens": 811,
            "output_tokens": 4,
            "latency_ms": 593.6,
            "usd": 4.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "31",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 31.0,
            "expected": 59.0,
            "tolerance": 4.0
          },
          "metrics": {
            "input_tokens": 811,
            "output_tokens": 4,
            "latency_ms": 733.5,
            "usd": 4.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 811.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 709.833,
        "usd_sum": 0.000126
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-005",
      "category": "nutrition",
      "capability": "macro_vector",
      "difficulty": "hard",
      "cuisine_region": "italian",
      "scorer": "numeric_vector_mae",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "375ca5d44d346632",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nA portion contains 85 g dry pasta (75 C / 13 P / 1.5 F per 100 g), 10 g butter (0.1 C / 0.9 P / 81 F per 100 g) and 15 g parmesan (3.2 C / 35 P / 29 F per 100 g). Return [carb_g, protein_g, fat_g, kcal] for the portion as a JSON array of four numbers.\n\nINPUT:\n{\n  \"pasta_g\": 85,\n  \"butter_g\": 10,\n  \"parmesan_g\": 15\n}\n\nANSWER FORMAT: json:array [carb, protein, fat, kcal]",
      "expected": {
        "values": [
          64.3,
          16.3,
          13.7,
          446
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[78.06,17.95,8.54,552]",
          "compliant": true,
          "score": 0.1125,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 78.06,
                "expected": 64.3,
                "score": 0.0
              },
              {
                "index": 1,
                "predicted": 17.95,
                "expected": 16.3,
                "score": 0.45
              },
              {
                "index": 2,
                "predicted": 8.54,
                "expected": 13.7,
                "score": 0.0
              },
              {
                "index": 3,
                "predicted": 552,
                "expected": 446,
                "score": 0.0
              }
            ],
            "tolerance": 3.0
          },
          "metrics": {
            "input_tokens": 889,
            "output_tokens": 18,
            "latency_ms": 552.6,
            "usd": 5.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[119.5, 21.8, 8.8, 668]",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 119.5,
                "expected": 64.3,
                "score": 0.0
              },
              {
                "index": 1,
                "predicted": 21.8,
                "expected": 16.3,
                "score": 0.0
              },
              {
                "index": 2,
                "predicted": 8.8,
                "expected": 13.7,
                "score": 0.0
              },
              {
                "index": 3,
                "predicted": 668,
                "expected": 446,
                "score": 0.0
              }
            ],
            "tolerance": 3.0
          },
          "metrics": {
            "input_tokens": 889,
            "output_tokens": 21,
            "latency_ms": 681.9,
            "usd": 5.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[89.7, 17.875, 15.75, 595]",
          "compliant": true,
          "score": 0.1979,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 89.7,
                "expected": 64.3,
                "score": 0.0
              },
              {
                "index": 1,
                "predicted": 17.875,
                "expected": 16.3,
                "score": 0.475
              },
              {
                "index": 2,
                "predicted": 15.75,
                "expected": 13.7,
                "score": 0.3167
              },
              {
                "index": 3,
                "predicted": 595,
                "expected": 446,
                "score": 0.0
              }
            ],
            "tolerance": 3.0
          },
          "metrics": {
            "input_tokens": 889,
            "output_tokens": 21,
            "latency_ms": 1671.3,
            "usd": 5.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.1035,
      "score_summary": {
        "mean": 0.1035,
        "stdev": 0.0993,
        "sem": 0.0573,
        "ci95_lo": -0.0089,
        "ci95_hi": 0.2158,
        "n": 3
      },
      "score_if_valid": 0.1035,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 889.0,
        "output_tokens_mean": 20.0,
        "latency_ms_mean": 968.6,
        "usd_sum": 0.000158
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-006",
      "category": "nutrition",
      "capability": "energy_calculation",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "1ba6175ed99ad486",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nA mocktail uses 180 mL fresh orange juice (approx 9 g carbohydrate per 100 g) plus 12 g granulated sugar. Return kcal as a single integer.\n\nINPUT:\n{\n  \"oj_ml\": 180,\n  \"sugar_g\": 12\n}\n\nANSWER FORMAT: numeric:integer kcal",
      "expected": {
        "value": 111,
        "tolerance": 6
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "104",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 104.0,
            "expected": 111.0,
            "tolerance": 6.0
          },
          "metrics": {
            "input_tokens": 808,
            "output_tokens": 4,
            "latency_ms": 586.9,
            "usd": 4.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "240",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 240.0,
            "expected": 111.0,
            "tolerance": 6.0
          },
          "metrics": {
            "input_tokens": 808,
            "output_tokens": 4,
            "latency_ms": 773.1,
            "usd": 4.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "numeric:integer kcal 224",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 224.0,
            "expected": 111.0,
            "tolerance": 6.0
          },
          "metrics": {
            "input_tokens": 808,
            "output_tokens": 9,
            "latency_ms": 624.6,
            "usd": 4.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 808.0,
        "output_tokens_mean": 5.667,
        "latency_ms_mean": 661.533,
        "usd_sum": 0.000128
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-007",
      "category": "nutrition",
      "capability": "fibre_rule",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "b9e5625ed0828c94",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nWhat kcal/g factor does the OGS nutrition reference assign to dietary fibre? Respond with a single number.\n\nANSWER FORMAT: numeric:decimal kcal/g",
      "expected": {
        "value": 2,
        "tolerance": 0.1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "2",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 2.0,
            "expected": 2.0,
            "tolerance": 0.1
          },
          "metrics": {
            "input_tokens": 772,
            "output_tokens": 4,
            "latency_ms": 496.5,
            "usd": 4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "2",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 2.0,
            "expected": 2.0,
            "tolerance": 0.1
          },
          "metrics": {
            "input_tokens": 772,
            "output_tokens": 4,
            "latency_ms": 547.0,
            "usd": 4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "2",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 2.0,
            "expected": 2.0,
            "tolerance": 0.1
          },
          "metrics": {
            "input_tokens": 772,
            "output_tokens": 4,
            "latency_ms": 567.7,
            "usd": 4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 772.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 537.067,
        "usd_sum": 0.00012
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-008",
      "category": "nutrition",
      "capability": "kj_conversion",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "95ef839d58e96bbc",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nConvert 250 kcal to kJ using the canonical OGS/EU conversion factor. Respond with a single integer.\n\nINPUT:\n{\n  \"kcal\": 250\n}\n\nANSWER FORMAT: numeric:integer kJ",
      "expected": {
        "value": 1046,
        "tolerance": 10
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "1046",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 1046.0,
            "expected": 1046.0,
            "tolerance": 10.0
          },
          "metrics": {
            "input_tokens": 786,
            "output_tokens": 5,
            "latency_ms": 473.0,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "1045",
          "compliant": true,
          "score": 0.9,
          "detail": {
            "predicted": 1045.0,
            "expected": 1046.0,
            "tolerance": 10.0
          },
          "metrics": {
            "input_tokens": 786,
            "output_tokens": 5,
            "latency_ms": 624.6,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "1047",
          "compliant": true,
          "score": 0.9,
          "detail": {
            "predicted": 1047.0,
            "expected": 1046.0,
            "tolerance": 10.0
          },
          "metrics": {
            "input_tokens": 786,
            "output_tokens": 5,
            "latency_ms": 615.6,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.9333,
      "score_summary": {
        "mean": 0.9333,
        "stdev": 0.0577,
        "sem": 0.0333,
        "ci95_lo": 0.868,
        "ci95_hi": 0.9987,
        "n": 3
      },
      "score_if_valid": 0.9333,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 786.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 571.067,
        "usd_sum": 0.000123
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-009",
      "category": "nutrition",
      "capability": "macro_vector",
      "difficulty": "hard",
      "cuisine_region": "italian",
      "scorer": "numeric_vector_mae",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "641416c8b33c6dbc",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nA simple Alfredo sauce is 100 g butter + 150 g heavy cream (36% fat: 2.8 C / 2 P / 37 F) + 60 g parmesan (3.2 C / 35 P / 29 F), reduced to 250 g finished weight. Return [carb_per100g, protein_per100g, fat_per100g, kcal_per100g] as a JSON array of four numbers.\n\nINPUT:\n{\n  \"butter_g\": 100,\n  \"cream_g\": 150,\n  \"parmesan_g\": 60,\n  \"finished_g\": 250\n}\n\nANSWER FORMAT: json:array [carb, protein, fat, kcal] per 100 g",
      "expected": {
        "values": [
          2.5,
          9.6,
          55.2,
          545
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[6.0,18.0,35.8,338]",
          "compliant": true,
          "score": 0.0312,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 6.0,
                "expected": 2.5,
                "score": 0.125
              },
              {
                "index": 1,
                "predicted": 18.0,
                "expected": 9.6,
                "score": 0.0
              },
              {
                "index": 2,
                "predicted": 35.8,
                "expected": 55.2,
                "score": 0.0
              },
              {
                "index": 3,
                "predicted": 338,
                "expected": 545,
                "score": 0.0
              }
            ],
            "tolerance": 4.0
          },
          "metrics": {
            "input_tokens": 894,
            "output_tokens": 18,
            "latency_ms": 746.7,
            "usd": 5.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[5, 15, 38, 384]",
          "compliant": true,
          "score": 0.0938,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 5,
                "expected": 2.5,
                "score": 0.375
              },
              {
                "index": 1,
                "predicted": 15,
                "expected": 9.6,
                "score": 0.0
              },
              {
                "index": 2,
                "predicted": 38,
                "expected": 55.2,
                "score": 0.0
              },
              {
                "index": 3,
                "predicted": 384,
                "expected": 545,
                "score": 0.0
              }
            ],
            "tolerance": 4.0
          },
          "metrics": {
            "input_tokens": 894,
            "output_tokens": 15,
            "latency_ms": 470.3,
            "usd": 5.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[6,19,69,636]",
          "compliant": true,
          "score": 0.0312,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 6,
                "expected": 2.5,
                "score": 0.125
              },
              {
                "index": 1,
                "predicted": 19,
                "expected": 9.6,
                "score": 0.0
              },
              {
                "index": 2,
                "predicted": 69,
                "expected": 55.2,
                "score": 0.0
              },
              {
                "index": 3,
                "predicted": 636,
                "expected": 545,
                "score": 0.0
              }
            ],
            "tolerance": 4.0
          },
          "metrics": {
            "input_tokens": 894,
            "output_tokens": 12,
            "latency_ms": 1120.8,
            "usd": 5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0521,
      "score_summary": {
        "mean": 0.0521,
        "stdev": 0.0361,
        "sem": 0.0209,
        "ci95_lo": 0.0112,
        "ci95_hi": 0.093,
        "n": 3
      },
      "score_if_valid": 0.0521,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 894.0,
        "output_tokens_mean": 15.0,
        "latency_ms_mean": 779.267,
        "usd_sum": 0.000153
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-010",
      "category": "nutrition",
      "capability": "numeric_compare",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "fd7edc20921469ff",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nFrom the set {olive oil, butter, granulated sugar, dry pasta}, return the ingredient with the highest kcal per gram. Respond with one short name.\n\nINPUT:\n{\n  \"set\": [\n    \"olive oil\",\n    \"butter\",\n    \"granulated sugar\",\n    \"dry pasta\"\n  ]\n}\n\nANSWER FORMAT: exact:string",
      "expected": {
        "value": "olive oil",
        "aliases": [
          "olive-oil"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "olive oil",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "olive oil",
            "expected": "olive oil",
            "aliases": [
              "olive-oil"
            ]
          },
          "metrics": {
            "input_tokens": 812,
            "output_tokens": 5,
            "latency_ms": 480.4,
            "usd": 4.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "olive oil",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "olive oil",
            "expected": "olive oil",
            "aliases": [
              "olive-oil"
            ]
          },
          "metrics": {
            "input_tokens": 812,
            "output_tokens": 5,
            "latency_ms": 543.3,
            "usd": 4.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "olive oil",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "olive oil",
            "expected": "olive oil",
            "aliases": [
              "olive-oil"
            ]
          },
          "metrics": {
            "input_tokens": 812,
            "output_tokens": 5,
            "latency_ms": 703.3,
            "usd": 4.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 812.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 575.667,
        "usd_sum": 0.000129
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-001",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "easy",
      "cuisine_region": "american",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "3fa6a67d5bfe7f1a",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage option for the dish below from the candidates. Respond with just the candidate id (e.g. 'a').\n\nINPUT:\n{\n  \"dish\": \"60-day dry-aged ribeye, medium-rare, served with peppercorn jus\",\n  \"candidates\": {\n    \"a\": \"young Barolo DOCG (high tannin, high acidity, full body)\",\n    \"b\": \"off-dry German Kabinett Riesling\",\n    \"c\": \"unoaked Muscadet sur lie\",\n    \"d\": \"sweet Pedro Ximenez sherry\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "b",
          "d",
          "c"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 377,
            "output_tokens": 4,
            "latency_ms": 513.6,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 377,
            "output_tokens": 4,
            "latency_ms": 644.9,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 377,
            "output_tokens": 4,
            "latency_ms": 590.1,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 377.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 582.867,
        "usd_sum": 6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-002",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "easy",
      "cuisine_region": "french",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "4b85899cd6206acb",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for the dish below from the candidates. Respond with just the candidate id.\n\nINPUT:\n{\n  \"dish\": \"fresh Kumamoto oysters on the half shell, lemon and mignonette\",\n  \"candidates\": {\n    \"a\": \"oaked California Chardonnay (14.5% ABV, buttery)\",\n    \"b\": \"Muscadet Sevre-et-Maine sur lie\",\n    \"c\": \"Amarone della Valpolicella\",\n    \"d\": \"bourbon old fashioned\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "b",
        "ranking": [
          "b",
          "a",
          "d",
          "c"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "b",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "b",
            "expected_best": "b",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 364,
            "output_tokens": 4,
            "latency_ms": 604.0,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "b",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "b",
            "expected_best": "b",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 364,
            "output_tokens": 4,
            "latency_ms": 622.5,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "b",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "b",
            "expected_best": "b",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 364,
            "output_tokens": 4,
            "latency_ms": 615.3,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 364.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 613.933,
        "usd_sum": 6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-003",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "mexican",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "41ed3ee840823502",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for the dish below. Respond with just the candidate id.\n\nINPUT:\n{\n  \"dish\": \"classic Chengdu-style mapo tofu (heavy doubanjiang, Sichuan peppercorn, ground pork, high chilli heat)\",\n  \"candidates\": {\n    \"a\": \"15% ABV Zinfandel\",\n    \"b\": \"off-dry Riesling Kabinett (~10% ABV)\",\n    \"c\": \"Islay single malt Scotch (Lagavulin 16)\",\n    \"d\": \"Brunello di Montalcino Riserva\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "b",
        "ranking": [
          "b",
          "d",
          "a",
          "c"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "b",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "b",
            "expected_best": "b",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 377,
            "output_tokens": 4,
            "latency_ms": 553.1,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "b",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "b",
            "expected_best": "b",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 377,
            "output_tokens": 4,
            "latency_ms": 668.5,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "b",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "b",
            "expected_best": "b",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 377,
            "output_tokens": 4,
            "latency_ms": 581.8,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 377.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 601.133,
        "usd_sum": 6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-004",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "82a4b41d0e3d5668",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for the dish below. Respond with just the candidate id.\n\nINPUT:\n{\n  \"dish\": \"warm goat cheese crouton on mesclun with walnut vinaigrette\",\n  \"candidates\": {\n    \"a\": \"Sancerre (Sauvignon Blanc, Loire)\",\n    \"b\": \"Napa Cabernet Sauvignon Reserve\",\n    \"c\": \"Vintage Port\",\n    \"d\": \"lagered pilsner\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "d",
          "b",
          "c"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 348,
            "output_tokens": 4,
            "latency_ms": 620.7,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 348,
            "output_tokens": 4,
            "latency_ms": 541.7,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 348,
            "output_tokens": 4,
            "latency_ms": 637.5,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 348.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 599.967,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-005",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "441705237fe65e54",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for the dish below. Respond with just the candidate id.\n\nINPUT:\n{\n  \"dish\": \"warm dark chocolate fondant (70% cacao), molten centre, vanilla ice cream\",\n  \"candidates\": {\n    \"a\": \"dry Sancerre Sauvignon Blanc\",\n    \"b\": \"vintage Port\",\n    \"c\": \"bone-dry Brut Nature Champagne\",\n    \"d\": \"Muscadet Sevre-et-Maine\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "b",
        "ranking": [
          "b",
          "c",
          "a",
          "d"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "b",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "b",
            "expected_best": "b",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 351,
            "output_tokens": 4,
            "latency_ms": 573.7,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "b",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "b",
            "expected_best": "b",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 351,
            "output_tokens": 4,
            "latency_ms": 533.6,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "b",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "b",
            "expected_best": "b",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 351,
            "output_tokens": 4,
            "latency_ms": 626.4,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 351.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 577.9,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-006",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "b195ee1dfe7968ab",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage to pair across a premium edomae sushi omakase. Respond with just the candidate id.\n\nINPUT:\n{\n  \"dish\": \"premium edomae sushi omakase (akami, toro, kohada, ikura, uni, anago)\",\n  \"candidates\": {\n    \"a\": \"Junmai Daiginjo sake (well-chilled)\",\n    \"b\": \"heavily oaked California Chardonnay\",\n    \"c\": \"Barolo Riserva\",\n    \"d\": \"Islay single malt\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "b",
          "d",
          "c"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 369,
            "output_tokens": 4,
            "latency_ms": 553.6,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 369,
            "output_tokens": 4,
            "latency_ms": 515.3,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 369,
            "output_tokens": 4,
            "latency_ms": 603.6,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 369.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 557.5,
        "usd_sum": 6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-007",
      "category": "pairing",
      "capability": "regional_pairing",
      "difficulty": "hard",
      "cuisine_region": "spanish",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "81bf14f26f4fa761",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for a classic paella valenciana (rabbit, chicken, garrofó beans, saffron). Respond with just the candidate id.\n\nINPUT:\n{\n  \"dish\": \"authentic paella valenciana\",\n  \"candidates\": {\n    \"a\": \"young Rioja crianza (Tempranillo blend, light oak)\",\n    \"b\": \"Amarone della Valpolicella Riserva\",\n    \"c\": \"Islay peated single malt\",\n    \"d\": \"off-dry Moscato d'Asti\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "b",
          "d",
          "c"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 370,
            "output_tokens": 4,
            "latency_ms": 416.5,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 370,
            "output_tokens": 4,
            "latency_ms": 748.6,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 370,
            "output_tokens": 4,
            "latency_ms": 574.9,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 370.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 580.0,
        "usd_sum": 6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-008",
      "category": "pairing",
      "capability": "pairing_grade_reasoning",
      "difficulty": "hard",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.grade_scale"
      ],
      "prompt_template_hash": "51ad14586a063779",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Grade scale (derived from 0-100 overall_score)\n\n- **A+**: 95-100\n- **A**: 90-94\n- **B+**: 85-89\n- **B**: 80-84\n- **C+**: 75-79\n- **C**: 70-74\n- **D**: 60-69\n- **F**: <60\n\n---\n\nA pairing's overall_score is computed as 68. Using the OGS grade scale, return the letter grade (ogs-match.md §5). Respond with just the grade token.\n\nINPUT:\n{\n  \"overall_score\": 68\n}\n\nANSWER FORMAT: one_of:A+,A,B+,B,C+,C,D,F",
      "expected": {
        "value": "D"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "C",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "C",
            "expected": "D",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 383,
            "output_tokens": 4,
            "latency_ms": 477.3,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "C",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "C",
            "expected": "D",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 383,
            "output_tokens": 4,
            "latency_ms": 522.2,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "C",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "C",
            "expected": "D",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 383,
            "output_tokens": 4,
            "latency_ms": 545.6,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 383.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 515.033,
        "usd_sum": 6.3e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-009",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "easy",
      "cuisine_region": "italian",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "d7ade64ce48bfbe4",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for the dish below from the candidates. Respond with just the candidate id (e.g. 'a').\n\nINPUT:\n{\n  \"dish\": \"classic Neapolitan pizza Margherita (tomato, buffalo mozzarella, basil)\",\n  \"candidates\": {\n    \"a\": \"bright Chianti Classico (Sangiovese, high acid, medium tannin)\",\n    \"b\": \"heavily oaked California Chardonnay\",\n    \"c\": \"sweet ice wine\",\n    \"d\": \"Islay peated whisky\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "b",
          "d",
          "c"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 370,
            "output_tokens": 4,
            "latency_ms": 572.6,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 370,
            "output_tokens": 4,
            "latency_ms": 603.1,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 370,
            "output_tokens": 4,
            "latency_ms": 595.2,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 370.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 590.3,
        "usd_sum": 6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-010",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "chinese",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "637717fdf3577204",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for the dish below.\n\nINPUT:\n{\n  \"dish\": \"mapo tofu with doubanjiang and Sichuan peppercorn, medium-hot\",\n  \"candidates\": {\n    \"a\": \"off-dry German Riesling Spätlese\",\n    \"b\": \"high-alcohol (15%) Zinfandel\",\n    \"c\": \"bone-dry Fino sherry\",\n    \"d\": \"straight rye whiskey\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "c",
          "d",
          "b"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 349,
            "output_tokens": 4,
            "latency_ms": 603.8,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 349,
            "output_tokens": 4,
            "latency_ms": 501.5,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 349,
            "output_tokens": 4,
            "latency_ms": 553.7,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 349.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 553.0,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-011",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "indian",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "2b655f8b1ea64df8",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for the dish below.\n\nINPUT:\n{\n  \"dish\": \"butter chicken (murgh makhani) in rich tomato-cream sauce with fenugreek\",\n  \"candidates\": {\n    \"a\": \"off-dry Gewurztraminer\",\n    \"b\": \"young tannic Cabernet Sauvignon\",\n    \"c\": \"unoaked Muscadet\",\n    \"d\": \"Islay single malt\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "c",
          "b",
          "d"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 345,
            "output_tokens": 4,
            "latency_ms": 579.2,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 345,
            "output_tokens": 4,
            "latency_ms": 475.1,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 345,
            "output_tokens": 4,
            "latency_ms": 626.7,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 345.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 560.333,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-012",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "ethiopian",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "3fe344c62017694c",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for the dish.\n\nINPUT:\n{\n  \"dish\": \"doro wat: chicken stewed in berbere-spiced ghee, served with injera\",\n  \"candidates\": {\n    \"a\": \"juicy, low-tannin Beaujolais (Gamay) served slightly chilled\",\n    \"b\": \"high-alcohol Amarone\",\n    \"c\": \"bone-dry sparkling Prosecco Brut\",\n    \"d\": \"heavily peated Islay whisky\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "c",
          "b",
          "d"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 360,
            "output_tokens": 4,
            "latency_ms": 642.9,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 360,
            "output_tokens": 4,
            "latency_ms": 500.3,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "c",
          "compliant": true,
          "score": 0.15,
          "detail": {
            "predicted": "c",
            "expected_best": "a",
            "rank": 1
          },
          "metrics": {
            "input_tokens": 360,
            "output_tokens": 4,
            "latency_ms": 576.9,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.7167,
      "score_summary": {
        "mean": 0.7167,
        "stdev": 0.4907,
        "sem": 0.2833,
        "ci95_lo": 0.1613,
        "ci95_hi": 1.272,
        "n": 3
      },
      "score_if_valid": 0.7167,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 360.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 573.367,
        "usd_sum": 6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-013",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "argentinian",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "4d30c6e439c9c85f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage.\n\nINPUT:\n{\n  \"dish\": \"grilled ribeye over wood embers, served with chimichurri\",\n  \"candidates\": {\n    \"a\": \"old-vine Mendoza Malbec, medium-plus tannin, black fruit\",\n    \"b\": \"unoaked Chablis\",\n    \"c\": \"off-dry Moscato\",\n    \"d\": \"dry Manzanilla sherry\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "d",
          "b",
          "c"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 346,
            "output_tokens": 4,
            "latency_ms": 573.8,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 346,
            "output_tokens": 4,
            "latency_ms": 658.8,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 346,
            "output_tokens": 4,
            "latency_ms": 571.7,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 346.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 601.433,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-014",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "hard",
      "cuisine_region": "japanese",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "848f37f91777acfb",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for otoro nigiri topped with a brush of soy.\n\nINPUT:\n{\n  \"dish\": \"bluefin tuna otoro nigiri, brushed soy\",\n  \"candidates\": {\n    \"a\": \"junmai daiginjo sake, chilled\",\n    \"b\": \"young Chinon Cabernet Franc\",\n    \"c\": \"dry Riesling Kabinett\",\n    \"d\": \"sweet Sauternes\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "c",
          "b",
          "d"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "c",
          "compliant": true,
          "score": 0.15,
          "detail": {
            "predicted": "c",
            "expected_best": "a",
            "rank": 1
          },
          "metrics": {
            "input_tokens": 345,
            "output_tokens": 4,
            "latency_ms": 909.4,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "c",
          "compliant": true,
          "score": 0.15,
          "detail": {
            "predicted": "c",
            "expected_best": "a",
            "rank": 1
          },
          "metrics": {
            "input_tokens": 345,
            "output_tokens": 4,
            "latency_ms": 518.8,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 345,
            "output_tokens": 4,
            "latency_ms": 696.5,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.4333,
      "score_summary": {
        "mean": 0.4333,
        "stdev": 0.4907,
        "sem": 0.2833,
        "ci95_lo": -0.122,
        "ci95_hi": 0.9887,
        "n": 3
      },
      "score_if_valid": 0.4333,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 345.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 708.233,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-015",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "hard",
      "cuisine_region": "moroccan",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "f5ecf9dcd59f0d0a",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage.\n\nINPUT:\n{\n  \"dish\": \"slow-cooked lamb shoulder tagine with preserved lemon and green olives\",\n  \"candidates\": {\n    \"a\": \"Southern Rhône Grenache-Syrah blend (Gigondas), moderate tannin\",\n    \"b\": \"unoaked Muscadet\",\n    \"c\": \"Champagne Blanc de Blancs\",\n    \"d\": \"young high-tannin Barolo\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "d",
          "c",
          "b"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 346,
            "output_tokens": 4,
            "latency_ms": 938.6,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 346,
            "output_tokens": 4,
            "latency_ms": 685.4,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 346,
            "output_tokens": 4,
            "latency_ms": 816.0,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 346.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 813.333,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-016",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "vietnamese",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "3dd7c0170877ac9b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for a bowl of Vietnamese beef pho.\n\nINPUT:\n{\n  \"dish\": \"pho bo: beef broth with star anise, cinnamon, rice noodles, rare beef, basil, lime\",\n  \"candidates\": {\n    \"a\": \"cold Vietnamese lager-style beer (e.g. Saigon or 333)\",\n    \"b\": \"tannic Cabernet Sauvignon\",\n    \"c\": \"heavily oaked Chardonnay\",\n    \"d\": \"sweet Tokaji Aszú 5 puttonyos\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "b",
          "c",
          "d"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 365,
            "output_tokens": 4,
            "latency_ms": 641.9,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 365,
            "output_tokens": 4,
            "latency_ms": 592.2,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 365,
            "output_tokens": 4,
            "latency_ms": 589.6,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 365.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 607.9,
        "usd_sum": 6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-017",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "hard",
      "cuisine_region": "french",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "8f60336584f33baf",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for 24-month aged Comté served on its own.\n\nINPUT:\n{\n  \"dish\": \"24-month aged Comté, served at room temperature\",\n  \"candidates\": {\n    \"a\": \"Jura Vin Jaune (Savagnin, oxidative, nutty)\",\n    \"b\": \"young oaky Cabernet Sauvignon\",\n    \"c\": \"dry Muscadet\",\n    \"d\": \"Sauternes\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "d",
          "b",
          "c"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 348,
            "output_tokens": 4,
            "latency_ms": 716.3,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 348,
            "output_tokens": 4,
            "latency_ms": 1180.4,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 348,
            "output_tokens": 4,
            "latency_ms": 538.5,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 348.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 811.733,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-018",
      "category": "pairing",
      "capability": "regional_pairing",
      "difficulty": "medium",
      "cuisine_region": "thai",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "9209873bc05779b7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for a fragrant, moderately spicy Thai green curry with chicken and Thai basil.\n\nINPUT:\n{\n  \"dish\": \"Thai green curry with chicken, Thai basil, kaffir lime\",\n  \"candidates\": {\n    \"a\": \"off-dry New Zealand Sauvignon Blanc with residual sugar\",\n    \"b\": \"heavy Port wine\",\n    \"c\": \"young tannic Bordeaux\",\n    \"d\": \"Islay peated whisky\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "c",
          "b",
          "d"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 349,
            "output_tokens": 4,
            "latency_ms": 550.7,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 349,
            "output_tokens": 4,
            "latency_ms": 508.2,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 349,
            "output_tokens": 4,
            "latency_ms": 688.9,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 349.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 582.6,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-019",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "mexican",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "da735293a9d3102b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage.\n\nINPUT:\n{\n  \"dish\": \"Oaxacan mole negro enchiladas (rich chocolate-chile-spice sauce over chicken)\",\n  \"candidates\": {\n    \"a\": \"medium-tannin Garnacha from Aragón\",\n    \"b\": \"unoaked Muscadet\",\n    \"c\": \"very sweet late-harvest Riesling\",\n    \"d\": \"bone-dry Brut Champagne\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "d",
          "c",
          "b"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 344,
            "output_tokens": 4,
            "latency_ms": 521.7,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 344,
            "output_tokens": 4,
            "latency_ms": 505.0,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 344,
            "output_tokens": 4,
            "latency_ms": 637.9,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 344.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 554.867,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-020",
      "category": "pairing",
      "capability": "pairing_grade_reasoning",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.grade_scale"
      ],
      "prompt_template_hash": "bac251ebb1c064ef",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Grade scale (derived from 0-100 overall_score)\n\n- **A+**: 95-100\n- **A**: 90-94\n- **B+**: 85-89\n- **B**: 80-84\n- **C+**: 75-79\n- **C**: 70-74\n- **D**: 60-69\n- **F**: <60\n\n---\n\nA pairing's overall_score is computed as 96. Using the OGS grade scale, return the letter grade.\n\nINPUT:\n{\n  \"overall_score\": 96\n}\n\nANSWER FORMAT: one_of:A+,A,B+,B,C+,C,D,F",
      "expected": {
        "value": "A+"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "A+",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a+",
            "expected": "a+",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 370,
            "output_tokens": 5,
            "latency_ms": 695.4,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "A+",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a+",
            "expected": "a+",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 370,
            "output_tokens": 5,
            "latency_ms": 656.6,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "A+",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a+",
            "expected": "a+",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 370,
            "output_tokens": 5,
            "latency_ms": 627.2,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 370.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 659.733,
        "usd_sum": 6.3e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-001",
      "category": "regional",
      "capability": "wine_region_knowledge",
      "difficulty": "easy",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "1df867dd3a55af5b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nWhat is the ONLY grape variety permitted in Barolo DOCG? Respond with just the variety name.\n\nANSWER FORMAT: string",
      "expected": {
        "value": "nebbiolo"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Nebbiolo",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "nebbiolo",
            "expected": "nebbiolo",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 245,
            "output_tokens": 6,
            "latency_ms": 1256.7,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Nebbiolo",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "nebbiolo",
            "expected": "nebbiolo",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 245,
            "output_tokens": 6,
            "latency_ms": 727.6,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Nebbiolo",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "nebbiolo",
            "expected": "nebbiolo",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 245,
            "output_tokens": 6,
            "latency_ms": 757.5,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 245.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 913.933,
        "usd_sum": 4.5e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-002",
      "category": "regional",
      "capability": "wine_region_knowledge",
      "difficulty": "easy",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "24a95f8a1c0a0a45",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nWhat single grape variety is used for Chablis AOC? Respond with just the variety name.\n\nANSWER FORMAT: string",
      "expected": {
        "value": "chardonnay"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Chardonnay",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "chardonnay",
            "expected": "chardonnay",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 245,
            "output_tokens": 5,
            "latency_ms": 594.5,
            "usd": 1.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Chardonnay",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "chardonnay",
            "expected": "chardonnay",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 245,
            "output_tokens": 5,
            "latency_ms": 601.4,
            "usd": 1.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Chardonnay",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "chardonnay",
            "expected": "chardonnay",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 245,
            "output_tokens": 5,
            "latency_ms": 630.7,
            "usd": 1.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 245.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 608.867,
        "usd_sum": 4.2e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-003",
      "category": "regional",
      "capability": "dish_origin",
      "difficulty": "easy",
      "cuisine_region": "vietnamese",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "15d36148f8b853e6",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nWhich country is phở originally from? Respond with just the country name.\n\nANSWER FORMAT: string",
      "expected": {
        "value": "vietnam"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Vietnam",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "vietnam",
            "expected": "vietnam",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 240,
            "output_tokens": 4,
            "latency_ms": 711.0,
            "usd": 1.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Vietnam",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "vietnam",
            "expected": "vietnam",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 240,
            "output_tokens": 4,
            "latency_ms": 588.5,
            "usd": 1.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Vietnam",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "vietnam",
            "expected": "vietnam",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 240,
            "output_tokens": 4,
            "latency_ms": 576.4,
            "usd": 1.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 240.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 625.3,
        "usd_sum": 4.2e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-004",
      "category": "regional",
      "capability": "classic_regional_pairing",
      "difficulty": "hard",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "e332af036a2bd591",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nName the classic Alsace wine traditionally paired with ripe, stinky Muenster cheese. One word, grape variety only.\n\nANSWER FORMAT: string",
      "expected": {
        "value": "gewurztraminer"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Gewürztraminer",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "gewürztraminer",
            "expected": "gewurztraminer",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 250,
            "output_tokens": 8,
            "latency_ms": 678.5,
            "usd": 1.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Gewürztraminer",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "gewürztraminer",
            "expected": "gewurztraminer",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 250,
            "output_tokens": 8,
            "latency_ms": 703.2,
            "usd": 1.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Riesling",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "riesling",
            "expected": "gewurztraminer",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 250,
            "output_tokens": 6,
            "latency_ms": 525.9,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 250.0,
        "output_tokens_mean": 7.333,
        "latency_ms_mean": 635.867,
        "usd_sum": 4.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-005",
      "category": "regional",
      "capability": "french_classical_cuisine",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "4aa2c63ffdeeacfb",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nName the French mother sauce that is made from a brown roux, brown stock and mirepoix, traditionally enriched with tomato. One word, lowercase.\n\nANSWER FORMAT: string",
      "expected": {
        "value": "espagnole"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "espagnole",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "espagnole",
            "expected": "espagnole",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 256,
            "output_tokens": 6,
            "latency_ms": 532.8,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "espagnole",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "espagnole",
            "expected": "espagnole",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 256,
            "output_tokens": 6,
            "latency_ms": 691.9,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "espagnole",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "espagnole",
            "expected": "espagnole",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 256,
            "output_tokens": 6,
            "latency_ms": 600.8,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 256.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 608.5,
        "usd_sum": 4.5e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-006",
      "category": "regional",
      "capability": "wine_region_knowledge",
      "difficulty": "medium",
      "cuisine_region": "spanish",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "979e7579b34d9797",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nWhat is the name of the official Rioja classification for a red wine aged at least 2 years (1 in oak) and 3+ years total? Respond with just the term.\n\nINPUT:\n{\n  \"question\": \"official Rioja classification for red wine aged 2+ years (1+ in oak), 3+ total\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "reserva"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Crianza",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "crianza",
            "expected": "reserva",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 294,
            "output_tokens": 6,
            "latency_ms": 671.2,
            "usd": 1.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Reserva",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "reserva",
            "expected": "reserva",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 294,
            "output_tokens": 4,
            "latency_ms": 836.0,
            "usd": 1.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Crianza",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "crianza",
            "expected": "reserva",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 294,
            "output_tokens": 6,
            "latency_ms": 676.8,
            "usd": 1.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.3333,
      "score_summary": {
        "mean": 0.3333,
        "stdev": 0.5774,
        "sem": 0.3333,
        "ci95_lo": -0.32,
        "ci95_hi": 0.9867,
        "n": 3
      },
      "score_if_valid": 0.3333,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 294.0,
        "output_tokens_mean": 5.333,
        "latency_ms_mean": 728.0,
        "usd_sum": 5e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-007",
      "category": "regional",
      "capability": "dish_origin",
      "difficulty": "easy",
      "cuisine_region": "korean",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "23a69016b1082553",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nFrom which country does kimchi originate? Respond with just the country name in English.\n\nINPUT:\n{\n  \"dish\": \"kimchi\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "korea",
        "aliases": [
          "south korea",
          "republic of korea"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "South Korea",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "south korea",
            "expected": "korea",
            "aliases": [
              "south korea",
              "republic of korea"
            ]
          },
          "metrics": {
            "input_tokens": 255,
            "output_tokens": 5,
            "latency_ms": 548.3,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Korea",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "korea",
            "expected": "korea",
            "aliases": [
              "south korea",
              "republic of korea"
            ]
          },
          "metrics": {
            "input_tokens": 255,
            "output_tokens": 5,
            "latency_ms": 577.5,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Korea",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "korea",
            "expected": "korea",
            "aliases": [
              "south korea",
              "republic of korea"
            ]
          },
          "metrics": {
            "input_tokens": 255,
            "output_tokens": 5,
            "latency_ms": 502.8,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 255.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 542.867,
        "usd_sum": 4.5e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-008",
      "category": "regional",
      "capability": "wine_region_knowledge",
      "difficulty": "hard",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "0a12eb057eb17654",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nWhat is the single grape variety required for Barbaresco DOCG?\n\nINPUT:\n{\n  \"wine\": \"Barbaresco DOCG\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "nebbiolo"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Nebbiolo",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "nebbiolo",
            "expected": "nebbiolo",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 255,
            "output_tokens": 6,
            "latency_ms": 619.6,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Nebbiolo",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "nebbiolo",
            "expected": "nebbiolo",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 255,
            "output_tokens": 6,
            "latency_ms": 710.8,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Nebbiolo",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "nebbiolo",
            "expected": "nebbiolo",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 255,
            "output_tokens": 6,
            "latency_ms": 609.6,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 255.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 646.667,
        "usd_sum": 4.5e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-009",
      "category": "regional",
      "capability": "dish_origin",
      "difficulty": "medium",
      "cuisine_region": "peruvian",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "767d178c3cce3333",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nWhich country is classic ceviche most commonly attributed to? Respond with the country name.\n\nINPUT:\n{\n  \"dish\": \"ceviche\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "peru"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Peru",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "peru",
            "expected": "peru",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 255,
            "output_tokens": 5,
            "latency_ms": 753.7,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Peru",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "peru",
            "expected": "peru",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 255,
            "output_tokens": 5,
            "latency_ms": 525.9,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Peru",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "peru",
            "expected": "peru",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 255,
            "output_tokens": 5,
            "latency_ms": 629.4,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 255.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 636.333,
        "usd_sum": 4.5e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-010",
      "category": "regional",
      "capability": "dish_origin",
      "difficulty": "easy",
      "cuisine_region": "spanish",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "1001169bc92aaacd",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nIn which country does paella valenciana originate? Respond with the country name.\n\nINPUT:\n{\n  \"dish\": \"paella valenciana\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "spain"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Spain",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "spain",
            "expected": "spain",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 256,
            "output_tokens": 4,
            "latency_ms": 539.5,
            "usd": 1.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Spain",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "spain",
            "expected": "spain",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 256,
            "output_tokens": 4,
            "latency_ms": 511.6,
            "usd": 1.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Spain",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "spain",
            "expected": "spain",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 256,
            "output_tokens": 4,
            "latency_ms": 694.3,
            "usd": 1.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 256.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 581.8,
        "usd_sum": 4.2e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-011",
      "category": "regional",
      "capability": "classic_regional_pairing",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "571e5074e66e6a11",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nName the classic French white wine region traditionally paired with raw oysters. Respond with just the region name.\n\nINPUT:\n{\n  \"dish\": \"raw oysters on the half shell\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "chablis",
        "aliases": [
          "muscadet"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Champagne",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "champagne",
            "expected": "chablis",
            "aliases": [
              "muscadet"
            ]
          },
          "metrics": {
            "input_tokens": 262,
            "output_tokens": 5,
            "latency_ms": 779.3,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Chablis",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "chablis",
            "expected": "chablis",
            "aliases": [
              "muscadet"
            ]
          },
          "metrics": {
            "input_tokens": 262,
            "output_tokens": 6,
            "latency_ms": 593.0,
            "usd": 1.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Chablis",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "chablis",
            "expected": "chablis",
            "aliases": [
              "muscadet"
            ]
          },
          "metrics": {
            "input_tokens": 262,
            "output_tokens": 6,
            "latency_ms": 509.9,
            "usd": 1.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.5774,
        "sem": 0.3333,
        "ci95_lo": 0.0133,
        "ci95_hi": 1.32,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 262.0,
        "output_tokens_mean": 5.667,
        "latency_ms_mean": 627.4,
        "usd_sum": 4.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-012",
      "category": "regional",
      "capability": "french_classical_cuisine",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "d9792be4d5ca7335",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nName the French mother sauce made from clarified butter, egg yolks, and lemon juice.\n\nINPUT:\n{\n  \"description\": \"emulsion of clarified butter, egg yolks, and lemon juice\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "hollandaise"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Hollandaise",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "hollandaise",
            "expected": "hollandaise",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 267,
            "output_tokens": 7,
            "latency_ms": 585.1,
            "usd": 1.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "hollandaise",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "hollandaise",
            "expected": "hollandaise",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 267,
            "output_tokens": 7,
            "latency_ms": 707.9,
            "usd": 1.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Hollandaise",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "hollandaise",
            "expected": "hollandaise",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 267,
            "output_tokens": 7,
            "latency_ms": 537.6,
            "usd": 1.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 267.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 610.2,
        "usd_sum": 4.8e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-013",
      "category": "regional",
      "capability": "dish_origin",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "c831a091be5025f9",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nIn which country did edomae-style sushi originate?\n\nINPUT:\n{\n  \"dish\": \"edomae sushi\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "japan"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Japan",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "japan",
            "expected": "japan",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 251,
            "output_tokens": 4,
            "latency_ms": 604.0,
            "usd": 1.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Japan",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "japan",
            "expected": "japan",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 251,
            "output_tokens": 4,
            "latency_ms": 597.2,
            "usd": 1.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Japan",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "japan",
            "expected": "japan",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 251,
            "output_tokens": 4,
            "latency_ms": 494.8,
            "usd": 1.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 251.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 565.333,
        "usd_sum": 4.2e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-014",
      "category": "regional",
      "capability": "dish_origin",
      "difficulty": "medium",
      "cuisine_region": "moroccan",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "7d761cbfb86342af",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nIn which country does the slow-cooked dish tagine originate?\n\nINPUT:\n{\n  \"dish\": \"tagine (clay-pot slow-cooked stew)\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "morocco"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Morocco",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "morocco",
            "expected": "morocco",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 259,
            "output_tokens": 5,
            "latency_ms": 616.1,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Morocco",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "morocco",
            "expected": "morocco",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 259,
            "output_tokens": 5,
            "latency_ms": 663.4,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Morocco",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "morocco",
            "expected": "morocco",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 259,
            "output_tokens": 5,
            "latency_ms": 609.2,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 259.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 629.567,
        "usd_sum": 4.5e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-015",
      "category": "regional",
      "capability": "wine_region_knowledge",
      "difficulty": "hard",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "2f35d24bb9d58118",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nWhat single grape variety is used for white Sancerre AOC?\n\nINPUT:\n{\n  \"wine\": \"Sancerre AOC blanc\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "sauvignon blanc",
        "aliases": [
          "sauvignon_blanc",
          "sauvignon"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Sauvignon Blanc",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "sauvignon blanc",
            "expected": "sauvignon blanc",
            "aliases": [
              "sauvignon_blanc",
              "sauvignon"
            ]
          },
          "metrics": {
            "input_tokens": 256,
            "output_tokens": 7,
            "latency_ms": 607.9,
            "usd": 1.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Sauvignon Blanc",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "sauvignon blanc",
            "expected": "sauvignon blanc",
            "aliases": [
              "sauvignon_blanc",
              "sauvignon"
            ]
          },
          "metrics": {
            "input_tokens": 256,
            "output_tokens": 7,
            "latency_ms": 517.3,
            "usd": 1.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Sauvignon Blanc",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "sauvignon blanc",
            "expected": "sauvignon blanc",
            "aliases": [
              "sauvignon_blanc",
              "sauvignon"
            ]
          },
          "metrics": {
            "input_tokens": 256,
            "output_tokens": 7,
            "latency_ms": 539.5,
            "usd": 1.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 256.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 554.9,
        "usd_sum": 4.8e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-016",
      "category": "regional",
      "capability": "dish_origin",
      "difficulty": "easy",
      "cuisine_region": "mexican",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "efb489c56faf463a",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nIn which country does mole poblano originate?\n\nINPUT:\n{\n  \"dish\": \"mole poblano\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "mexico"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Mexico",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "mexico",
            "expected": "mexico",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 249,
            "output_tokens": 4,
            "latency_ms": 688.8,
            "usd": 1.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Mexico",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "mexico",
            "expected": "mexico",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 249,
            "output_tokens": 4,
            "latency_ms": 727.1,
            "usd": 1.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Mexico",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "mexico",
            "expected": "mexico",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 249,
            "output_tokens": 4,
            "latency_ms": 4873.0,
            "usd": 1.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 249.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 2096.3,
        "usd_sum": 4.2e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-017",
      "category": "regional",
      "capability": "classic_regional_pairing",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "53530e4634162afc",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nName the Alsatian white wine traditionally served with choucroute garnie.\n\nINPUT:\n{\n  \"dish\": \"choucroute garnie\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "riesling"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Riesling",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "riesling",
            "expected": "riesling",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 256,
            "output_tokens": 6,
            "latency_ms": 585.0,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Riesling",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "riesling",
            "expected": "riesling",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 256,
            "output_tokens": 6,
            "latency_ms": 876.6,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Riesling",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "riesling",
            "expected": "riesling",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 256,
            "output_tokens": 6,
            "latency_ms": 524.2,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 256.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 661.933,
        "usd_sum": 4.5e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-018",
      "category": "regional",
      "capability": "dish_origin",
      "difficulty": "medium",
      "cuisine_region": "brazilian",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "28a636bdc82633de",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nWhich country claims feijoada completa as a national dish?\n\nINPUT:\n{\n  \"dish\": \"feijoada completa\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "brazil"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Brazil",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "brazil",
            "expected": "brazil",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 252,
            "output_tokens": 4,
            "latency_ms": 566.2,
            "usd": 1.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Brazil",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "brazil",
            "expected": "brazil",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 252,
            "output_tokens": 4,
            "latency_ms": 755.6,
            "usd": 1.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Brazil",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "brazil",
            "expected": "brazil",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 252,
            "output_tokens": 4,
            "latency_ms": 462.6,
            "usd": 1.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 252.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 594.8,
        "usd_sum": 4.2e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-019",
      "category": "regional",
      "capability": "wine_region_knowledge",
      "difficulty": "hard",
      "cuisine_region": "hungarian",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "2c5869ce479f65cc",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nTokaji Aszú's vineyards are centred in a single country. Name that country in English.\n\nINPUT:\n{\n  \"wine\": \"Tokaji Aszú\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "hungary"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Hungary",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "hungary",
            "expected": "hungary",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 261,
            "output_tokens": 5,
            "latency_ms": 594.9,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Hungary",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "hungary",
            "expected": "hungary",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 261,
            "output_tokens": 5,
            "latency_ms": 584.7,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Hungary",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "hungary",
            "expected": "hungary",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 261,
            "output_tokens": 5,
            "latency_ms": 778.8,
            "usd": 1.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 261.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 652.8,
        "usd_sum": 4.5e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-020",
      "category": "regional",
      "capability": "classic_regional_pairing",
      "difficulty": "hard",
      "cuisine_region": "spanish",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "aa3761bdbf2d490e",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nName the Galician white wine grape traditionally paired with pulpo á feira (boiled octopus with pimentón).\n\nINPUT:\n{\n  \"dish\": \"pulpo á feira — Galician boiled octopus with pimentón and olive oil\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "albarino",
        "aliases": [
          "albariño"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Albariño",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "albariño",
            "expected": "albarino",
            "aliases": [
              "albariño"
            ]
          },
          "metrics": {
            "input_tokens": 277,
            "output_tokens": 6,
            "latency_ms": 606.5,
            "usd": 1.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "Albariño",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "albariño",
            "expected": "albarino",
            "aliases": [
              "albariño"
            ]
          },
          "metrics": {
            "input_tokens": 277,
            "output_tokens": 6,
            "latency_ms": 589.2,
            "usd": 1.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "Albariño",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "albariño",
            "expected": "albarino",
            "aliases": [
              "albariño"
            ]
          },
          "metrics": {
            "input_tokens": 277,
            "output_tokens": 6,
            "latency_ms": 625.8,
            "usd": 1.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 277.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 607.167,
        "usd_sum": 4.8e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-001",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "08845fcf7f8beb09",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"salmon and tuna nigiri\",\n  \"beverage\": \"young Bordeaux Left Bank (Cabernet Sauvignon, high tannin)\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_TANNIN_FISH"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_TANNIN_FISH",
            "expected": "RISK_TANNIN_FISH",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 636,
            "output_tokens": 10,
            "latency_ms": 603.5,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_TANNIN_FISH",
            "expected": "RISK_TANNIN_FISH",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 636,
            "output_tokens": 10,
            "latency_ms": 790.4,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_TANNIN_FISH",
            "expected": "RISK_TANNIN_FISH",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 636,
            "output_tokens": 10,
            "latency_ms": 902.3,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 636.0,
        "output_tokens_mean": 10.0,
        "latency_ms_mean": 765.4,
        "usd_sum": 0.000108
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-002",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "7a188b424515258f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"Thai green curry with bird's eye chilli, high heat\",\n  \"beverage\": \"15.5% ABV California Zinfandel\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_SPICE_ALCOHOL"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_SPICE_ALCOHOL",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_SPICE_ALCOHOL",
            "expected": "RISK_SPICE_ALCOHOL",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 638,
            "output_tokens": 10,
            "latency_ms": 572.2,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_SPICE_ALCOHOL",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_SPICE_ALCOHOL",
            "expected": "RISK_SPICE_ALCOHOL",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 638,
            "output_tokens": 10,
            "latency_ms": 521.0,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_SPICE_ALCOHOL",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_SPICE_ALCOHOL",
            "expected": "RISK_SPICE_ALCOHOL",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 638,
            "output_tokens": 10,
            "latency_ms": 879.5,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 638.0,
        "output_tokens_mean": 10.0,
        "latency_ms_mean": 657.567,
        "usd_sum": 0.000108
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-003",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "a60fa6dd5c1ff452",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code for the pairing. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"delicate flounder sashimi with ponzu\",\n  \"beverage\": \"heavily new-oaked California Chardonnay, 14% ABV, buttery ML\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_OAK_MASKS"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_OAK_MASKS",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_OAK_MASKS",
            "expected": "RISK_OAK_MASKS",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 641,
            "output_tokens": 9,
            "latency_ms": 829.3,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_OAK_MASKS",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_OAK_MASKS",
            "expected": "RISK_OAK_MASKS",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 641,
            "output_tokens": 9,
            "latency_ms": 681.8,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_OAK_MASKS",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_OAK_MASKS",
            "expected": "RISK_OAK_MASKS",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 641,
            "output_tokens": 9,
            "latency_ms": 814.5,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 641.0,
        "output_tokens_mean": 9.0,
        "latency_ms_mean": 775.2,
        "usd_sum": 0.000108
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-004",
      "category": "risk",
      "capability": "risk_severity",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "16286de7fea14ae1",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nUsing OGS risk severity levels (low/medium/high), classify the RISK_ALCOHOL_OVERPOWERS severity for the pairing below. Respond with just the severity token.\n\nINPUT:\n{\n  \"dish\": \"pan-seared halibut with beurre blanc, very delicate flavours\",\n  \"beverage\": \"15.5% ABV Amarone della Valpolicella\"\n}\n\nANSWER FORMAT: one_of:low,medium,high",
      "expected": {
        "value": "high"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "high",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "high",
            "expected": "high",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 603,
            "output_tokens": 4,
            "latency_ms": 712.3,
            "usd": 3.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "high",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "high",
            "expected": "high",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 603,
            "output_tokens": 4,
            "latency_ms": 575.6,
            "usd": 3.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "high",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "high",
            "expected": "high",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 603,
            "output_tokens": 4,
            "latency_ms": 609.7,
            "usd": 3.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 603.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 632.533,
        "usd_sum": 9.6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-005",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "cd44f8ac361108e8",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code for the pairing. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"caramel panna cotta with honey\",\n  \"beverage\": \"bone-dry Brut Nature Champagne (zero dosage)\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_SWEETNESS_CONFLICT"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_SWEETNESS_CONFLICT",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_SWEETNESS_CONFLICT",
            "expected": "RISK_SWEETNESS_CONFLICT",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 630,
            "output_tokens": 11,
            "latency_ms": 623.5,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_INTENSITY_MISMATCH",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "RISK_INTENSITY_MISMATCH",
            "expected": "RISK_SWEETNESS_CONFLICT",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 630,
            "output_tokens": 11,
            "latency_ms": 789.0,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_ACID_CLASH",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "RISK_ACID_CLASH",
            "expected": "RISK_SWEETNESS_CONFLICT",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 630,
            "output_tokens": 9,
            "latency_ms": 811.0,
            "usd": 3.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.3333,
      "score_summary": {
        "mean": 0.3333,
        "stdev": 0.5774,
        "sem": 0.3333,
        "ci95_lo": -0.32,
        "ci95_hi": 0.9867,
        "n": 3
      },
      "score_if_valid": 0.3333,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 630.0,
        "output_tokens_mean": 10.333,
        "latency_ms_mean": 741.167,
        "usd_sum": 0.000107
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-006",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "easy",
      "cuisine_region": "japanese",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "28cb271326a709c7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"raw tuna sashimi with a light soy-wasabi dip\",\n  \"beverage\": \"young tannic Napa Cabernet Sauvignon\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_TANNIN_FISH"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_tannin_fish",
            "expected": "risk_tannin_fish",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 636,
            "output_tokens": 10,
            "latency_ms": 603.9,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_tannin_fish",
            "expected": "risk_tannin_fish",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 636,
            "output_tokens": 10,
            "latency_ms": 2040.9,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_tannin_fish",
            "expected": "risk_tannin_fish",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 636,
            "output_tokens": 10,
            "latency_ms": 614.3,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 636.0,
        "output_tokens_mean": 10.0,
        "latency_ms_mean": 1086.367,
        "usd_sum": 0.000108
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-007",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "thai",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "93c46eac23eca249",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"spicy Thai green curry with chilies and coconut milk\",\n  \"beverage\": \"15.5% abv California Zinfandel\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_SPICE_ALCOHOL"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_SPICE_ALCOHOL",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_spice_alcohol",
            "expected": "risk_spice_alcohol",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 639,
            "output_tokens": 10,
            "latency_ms": 664.8,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_SPICE_ALCOHOL",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_spice_alcohol",
            "expected": "risk_spice_alcohol",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 639,
            "output_tokens": 10,
            "latency_ms": 1054.5,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_SPICE_ALCOHOL",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_spice_alcohol",
            "expected": "risk_spice_alcohol",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 639,
            "output_tokens": 10,
            "latency_ms": 610.4,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 639.0,
        "output_tokens_mean": 10.0,
        "latency_ms_mean": 776.567,
        "usd_sum": 0.000108
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-008",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "2409e606629eebf9",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"grilled escarole salad with bitter radicchio and walnuts\",\n  \"beverage\": \"bone-dry bitter amontillado sherry\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_BITTER_AMPLIFY"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_BITTER_AMPLIFY",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_bitter_amplify",
            "expected": "risk_bitter_amplify",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 641,
            "output_tokens": 10,
            "latency_ms": 576.3,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_BITTER_AMPLIFY",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_bitter_amplify",
            "expected": "risk_bitter_amplify",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 641,
            "output_tokens": 10,
            "latency_ms": 622.8,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_BITTER_AMPLIFY",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_bitter_amplify",
            "expected": "risk_bitter_amplify",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 641,
            "output_tokens": 10,
            "latency_ms": 639.6,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 641.0,
        "output_tokens_mean": 10.0,
        "latency_ms_mean": 612.9,
        "usd_sum": 0.000108
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-009",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "a00498fd7fce7cb7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"raw scallop crudo with yuzu\",\n  \"beverage\": \"heavily oaked California Chardonnay, new French barrels\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_OAK_MASKS"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_OAK_MASKS",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_oak_masks",
            "expected": "risk_oak_masks",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 637,
            "output_tokens": 9,
            "latency_ms": 689.6,
            "usd": 3.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_OAK_MASKS",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_oak_masks",
            "expected": "risk_oak_masks",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 637,
            "output_tokens": 9,
            "latency_ms": 649.0,
            "usd": 3.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_OAK_MASKS",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_oak_masks",
            "expected": "risk_oak_masks",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 637,
            "output_tokens": 9,
            "latency_ms": 801.7,
            "usd": 3.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 637.0,
        "output_tokens_mean": 9.0,
        "latency_ms_mean": 713.433,
        "usd_sum": 0.000105
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-010",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "b28af5fc40407ce0",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"warm clear beef consommé\",\n  \"beverage\": \"sweet late-harvest Moscato d'Asti\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_SWEETNESS_CONFLICT"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_SWEETNESS_CONFLICT",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_sweetness_conflict",
            "expected": "risk_sweetness_conflict",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 633,
            "output_tokens": 11,
            "latency_ms": 551.0,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_SWEETNESS_CONFLICT",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_sweetness_conflict",
            "expected": "risk_sweetness_conflict",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 633,
            "output_tokens": 11,
            "latency_ms": 685.0,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_SWEETNESS_CONFLICT",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_sweetness_conflict",
            "expected": "risk_sweetness_conflict",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 633,
            "output_tokens": 11,
            "latency_ms": 573.9,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 633.0,
        "output_tokens_mean": 11.0,
        "latency_ms_mean": 603.3,
        "usd_sum": 0.000108
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-011",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "fba6f85afa81595f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"frisée salad with aggressive Dijon vinaigrette (1:2 vinegar to oil)\",\n  \"beverage\": \"bone-dry Brut Nature Champagne\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_ACID_CLASH"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_SWEETNESS_CONFLICT",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "risk_sweetness_conflict",
            "expected": "risk_acid_clash",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 640,
            "output_tokens": 11,
            "latency_ms": 724.4,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_ACID_CLASH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_acid_clash",
            "expected": "risk_acid_clash",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 640,
            "output_tokens": 9,
            "latency_ms": 697.4,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_INTENSITY_MISMATCH",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "risk_intensity_mismatch",
            "expected": "risk_acid_clash",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 640,
            "output_tokens": 11,
            "latency_ms": 632.5,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.3333,
      "score_summary": {
        "mean": 0.3333,
        "stdev": 0.5774,
        "sem": 0.3333,
        "ci95_lo": -0.32,
        "ci95_hi": 0.9867,
        "n": 3
      },
      "score_if_valid": 0.3333,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 640.0,
        "output_tokens_mean": 10.333,
        "latency_ms_mean": 684.767,
        "usd_sum": 0.000108
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-012",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "easy",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "2f3a4360a39afdc9",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"steamed turbot with beurre blanc\",\n  \"beverage\": \"young powerful Barolo\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_INTENSITY_MISMATCH"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_INTENSITY_MISMATCH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_intensity_mismatch",
            "expected": "risk_intensity_mismatch",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 629,
            "output_tokens": 11,
            "latency_ms": 682.7,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "risk_tannin_fish",
            "expected": "risk_intensity_mismatch",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 629,
            "output_tokens": 10,
            "latency_ms": 680.7,
            "usd": 3.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "risk_tannin_fish",
            "expected": "risk_intensity_mismatch",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 629,
            "output_tokens": 10,
            "latency_ms": 791.9,
            "usd": 3.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.3333,
      "score_summary": {
        "mean": 0.3333,
        "stdev": 0.5774,
        "sem": 0.3333,
        "ci95_lo": -0.32,
        "ci95_hi": 0.9867,
        "n": 3
      },
      "score_if_valid": 0.3333,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 629.0,
        "output_tokens_mean": 10.333,
        "latency_ms_mean": 718.433,
        "usd_sum": 0.000106
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-013",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "390e90cae060ada8",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"yellowtail sashimi\",\n  \"beverage\": \"overproof 63% abv Jamaican rum\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_ALCOHOL_OVERPOWERS"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_ALCOHOL_OVERPOWERS",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_alcohol_overpowers",
            "expected": "risk_alcohol_overpowers",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 631,
            "output_tokens": 12,
            "latency_ms": 783.2,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_ALCOHOL_OVERPOWERS",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_alcohol_overpowers",
            "expected": "risk_alcohol_overpowers",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 631,
            "output_tokens": 12,
            "latency_ms": 627.3,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_ALCOHOL_OVERPOWERS",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_alcohol_overpowers",
            "expected": "risk_alcohol_overpowers",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 631,
            "output_tokens": 12,
            "latency_ms": 573.2,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 631.0,
        "output_tokens_mean": 12.0,
        "latency_ms_mean": 661.233,
        "usd_sum": 0.000108
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-014",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "d83b8b2ecabe9907",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"gin-cured salmon gravlax with dill\",\n  \"beverage\": \"young tannic Chianti Classico\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_TANNIN_FISH"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_tannin_fish",
            "expected": "risk_tannin_fish",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 634,
            "output_tokens": 10,
            "latency_ms": 465.8,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_tannin_fish",
            "expected": "risk_tannin_fish",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 634,
            "output_tokens": 10,
            "latency_ms": 712.7,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_tannin_fish",
            "expected": "risk_tannin_fish",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 634,
            "output_tokens": 10,
            "latency_ms": 847.0,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 634.0,
        "output_tokens_mean": 10.0,
        "latency_ms_mean": 675.167,
        "usd_sum": 0.000108
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-015",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "1e83097d483e0639",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"poached lobster with drawn butter\",\n  \"beverage\": \"aggressive 80-IBU West Coast IPA\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_BITTER_AMPLIFY"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_BITTER_AMPLIFY",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_bitter_amplify",
            "expected": "risk_bitter_amplify",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 634,
            "output_tokens": 10,
            "latency_ms": 581.9,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_ALCOHOL_OVERPOWERS",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "risk_alcohol_overpowers",
            "expected": "risk_bitter_amplify",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 634,
            "output_tokens": 12,
            "latency_ms": 661.6,
            "usd": 3.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_BITTER_AMPLIFY",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_bitter_amplify",
            "expected": "risk_bitter_amplify",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 634,
            "output_tokens": 10,
            "latency_ms": 794.4,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.5774,
        "sem": 0.3333,
        "ci95_lo": 0.0133,
        "ci95_hi": 1.32,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 634.0,
        "output_tokens_mean": 10.667,
        "latency_ms_mean": 679.3,
        "usd_sum": 0.000109
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-016",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "easy",
      "cuisine_region": "mexican",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "b2e02272ca87b2d4",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"habanero-heavy Yucatán salsa\",\n  \"beverage\": \"15% abv Zinfandel\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_SPICE_ALCOHOL"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_SPICE_ALCOHOL",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_spice_alcohol",
            "expected": "risk_spice_alcohol",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 633,
            "output_tokens": 10,
            "latency_ms": 562.0,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_SPICE_ALCOHOL",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_spice_alcohol",
            "expected": "risk_spice_alcohol",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 633,
            "output_tokens": 10,
            "latency_ms": 617.2,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_SPICE_ALCOHOL",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_spice_alcohol",
            "expected": "risk_spice_alcohol",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 633,
            "output_tokens": 10,
            "latency_ms": 648.4,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 633.0,
        "output_tokens_mean": 10.0,
        "latency_ms_mean": 609.2,
        "usd_sum": 0.000108
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-017",
      "category": "risk",
      "capability": "risk_severity",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "12372c8de13add55",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nUsing OGS risk severity levels (low/medium/high), classify the risk severity of the described pairing. Respond with just the severity level.\n\nINPUT:\n{\n  \"dish\": \"simple grilled branzino with lemon\",\n  \"beverage\": \"heavily oaked California Chardonnay\",\n  \"risk_code\": \"RISK_OAK_MASKS\"\n}\n\nANSWER FORMAT: one_of:low,medium,high",
      "expected": {
        "value": "medium"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "high",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "high",
            "expected": "medium",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 596,
            "output_tokens": 4,
            "latency_ms": 531.6,
            "usd": 3.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "high",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "high",
            "expected": "medium",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 596,
            "output_tokens": 4,
            "latency_ms": 529.2,
            "usd": 3.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "high",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "high",
            "expected": "medium",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 596,
            "output_tokens": 4,
            "latency_ms": 492.1,
            "usd": 3.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 596.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 517.633,
        "usd_sum": 9.3e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-018",
      "category": "risk",
      "capability": "risk_severity",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "7a97c002792dfad9",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nUsing OGS risk severity levels, classify the severity of a tannic red wine with raw anchovies.\n\nINPUT:\n{\n  \"dish\": \"raw marinated anchovies\",\n  \"beverage\": \"young tannic Cabernet Sauvignon\",\n  \"risk_code\": \"RISK_TANNIN_FISH\"\n}\n\nANSWER FORMAT: one_of:low,medium,high",
      "expected": {
        "value": "high"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "high",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "high",
            "expected": "high",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 586,
            "output_tokens": 4,
            "latency_ms": 1161.3,
            "usd": 3.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "high",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "high",
            "expected": "high",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 586,
            "output_tokens": 4,
            "latency_ms": 589.7,
            "usd": 3.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "high",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "high",
            "expected": "high",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 586,
            "output_tokens": 4,
            "latency_ms": 541.4,
            "usd": 3.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 586.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 764.133,
        "usd_sum": 9.3e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-019",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "german",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "0e3d6c3f9d86af0b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"raw Kumamoto oyster\",\n  \"beverage\": \"off-dry Spätlese Riesling with 45 g/L residual sugar\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_SWEETNESS_CONFLICT"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "risk_tannin_fish",
            "expected": "risk_sweetness_conflict",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 638,
            "output_tokens": 10,
            "latency_ms": 661.0,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_SWEETNESS_CONFLICT",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_sweetness_conflict",
            "expected": "risk_sweetness_conflict",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 638,
            "output_tokens": 11,
            "latency_ms": 879.1,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_SWEETNESS_CONFLICT",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_sweetness_conflict",
            "expected": "risk_sweetness_conflict",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 638,
            "output_tokens": 11,
            "latency_ms": 648.0,
            "usd": 3.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.5774,
        "sem": 0.3333,
        "ci95_lo": 0.0133,
        "ci95_hi": 1.32,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 638.0,
        "output_tokens_mean": 10.667,
        "latency_ms_mean": 729.367,
        "usd_sum": 0.000108
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-020",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "2dd649d25c606ee7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"slow-braised oxtail stew with dark jus\",\n  \"beverage\": \"very pale Provençal rosé, 12.5% abv\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_INTENSITY_MISMATCH"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_ALCOHOL_OVERPOWERS",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "risk_alcohol_overpowers",
            "expected": "risk_intensity_mismatch",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 642,
            "output_tokens": 12,
            "latency_ms": 565.1,
            "usd": 3.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_ALCOHOL_OVERPOWERS",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "risk_alcohol_overpowers",
            "expected": "risk_intensity_mismatch",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 642,
            "output_tokens": 12,
            "latency_ms": 1049.9,
            "usd": 3.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_ALCOHOL_OVERPOWERS",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "risk_alcohol_overpowers",
            "expected": "risk_intensity_mismatch",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 642,
            "output_tokens": 12,
            "latency_ms": 657.0,
            "usd": 3.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 642.0,
        "output_tokens_mean": 12.0,
        "latency_ms_mean": 757.333,
        "usd_sum": 0.000111
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-001",
      "category": "safety",
      "capability": "cooking_temperature",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "8e37b185cfaed2aa",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nWhat is the minimum core temperature (°C) required to safely cook poultry under the OGS HACCP reference? Respond with a single number.\n\nINPUT:\n{\n  \"product\": \"whole chicken\"\n}\n\nANSWER FORMAT: numeric:integer or decimal °C",
      "expected": {
        "value": 75,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "75",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 75.0,
            "expected": 75.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 794,
            "output_tokens": 4,
            "latency_ms": 496.1,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "75",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 75.0,
            "expected": 75.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 794,
            "output_tokens": 4,
            "latency_ms": 698.3,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "75",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 75.0,
            "expected": 75.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 794,
            "output_tokens": 4,
            "latency_ms": 9149.8,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 794.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 3448.067,
        "usd_sum": 0.000123
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-002",
      "category": "safety",
      "capability": "cooking_temperature",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "d38d6470a18f7e97",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nA restaurant serves minced-beef smash burgers. What minimum core temperature (°C) should the patties reach? Respond with a single number.\n\nINPUT:\n{\n  \"product\": \"minced beef patty\"\n}\n\nANSWER FORMAT: numeric:integer °C",
      "expected": {
        "value": 75,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "75",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 75.0,
            "expected": 75.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 795,
            "output_tokens": 4,
            "latency_ms": 671.8,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "75",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 75.0,
            "expected": 75.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 795,
            "output_tokens": 4,
            "latency_ms": 706.3,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "75",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 75.0,
            "expected": 75.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 795,
            "output_tokens": 4,
            "latency_ms": 568.5,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 795.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 648.867,
        "usd_sum": 0.000123
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-003",
      "category": "safety",
      "capability": "danger_zone",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "97f1d8e6fc44b9e2",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nThe bacterial danger zone is defined between two temperatures in °C. Respond as 'LOW-HIGH' in °C (integers), e.g. '10-50'.\n\nANSWER FORMAT: exact:LOW-HIGH °C",
      "expected": {
        "value": "5-63",
        "aliases": [
          "5 to 63",
          "5-63°C",
          "5 C - 63 C"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "5-63 °C",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "5-63 °c",
            "expected": "5-63",
            "aliases": [
              "5 to 63",
              "5-63°C",
              "5 C - 63 C"
            ]
          },
          "metrics": {
            "input_tokens": 788,
            "output_tokens": 8,
            "latency_ms": 678.6,
            "usd": 4.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "5-63 °C",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "5-63 °c",
            "expected": "5-63",
            "aliases": [
              "5 to 63",
              "5-63°C",
              "5 C - 63 C"
            ]
          },
          "metrics": {
            "input_tokens": 788,
            "output_tokens": 8,
            "latency_ms": 787.2,
            "usd": 4.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "5-63 °C",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "5-63 °c",
            "expected": "5-63",
            "aliases": [
              "5 to 63",
              "5-63°C",
              "5 C - 63 C"
            ]
          },
          "metrics": {
            "input_tokens": 788,
            "output_tokens": 8,
            "latency_ms": 614.0,
            "usd": 4.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 788.0,
        "output_tokens_mean": 8.0,
        "latency_ms_mean": 693.267,
        "usd_sum": 0.000129
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-004",
      "category": "safety",
      "capability": "cooling_rule",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "9e5770b6c7036769",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nUnder UK FSA guidance, hot food must be cooled from 63 °C down to 21 °C within how many hours? Respond with a single integer.\n\nANSWER FORMAT: numeric:integer hours",
      "expected": {
        "value": 2,
        "tolerance": 0
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "2",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 2.0,
            "expected": 2.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 783,
            "output_tokens": 4,
            "latency_ms": 602.5,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "2",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 2.0,
            "expected": 2.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 783,
            "output_tokens": 4,
            "latency_ms": 634.5,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "2",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 2.0,
            "expected": 2.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 783,
            "output_tokens": 4,
            "latency_ms": 628.7,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 783.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 621.9,
        "usd_sum": 0.000123
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-005",
      "category": "safety",
      "capability": "reheating",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "cb2ff73059921b59",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nTo what minimum core temperature (°C) must a previously cooked and chilled dish be reheated under OGS HACCP? Respond with a single number.\n\nANSWER FORMAT: numeric:integer °C",
      "expected": {
        "value": 75,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "75",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 75.0,
            "expected": 75.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 782,
            "output_tokens": 4,
            "latency_ms": 736.0,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "75",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 75.0,
            "expected": 75.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 782,
            "output_tokens": 4,
            "latency_ms": 539.9,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "75",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 75.0,
            "expected": 75.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 782,
            "output_tokens": 4,
            "latency_ms": 488.1,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 782.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 588.0,
        "usd_sum": 0.000123
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-006",
      "category": "safety",
      "capability": "hot_holding",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "b3c9f3a848d65390",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nWhat minimum temperature (°C) must hot food be held at for service? Respond with a single number.\n\nANSWER FORMAT: numeric:integer °C",
      "expected": {
        "value": 63,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "63",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 63.0,
            "expected": 63.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 773,
            "output_tokens": 4,
            "latency_ms": 600.3,
            "usd": 4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "63",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 63.0,
            "expected": 63.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 773,
            "output_tokens": 4,
            "latency_ms": 588.6,
            "usd": 4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "63",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 63.0,
            "expected": 63.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 773,
            "output_tokens": 4,
            "latency_ms": 569.3,
            "usd": 4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 773.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 586.067,
        "usd_sum": 0.00012
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-007",
      "category": "safety",
      "capability": "cold_chain",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "f120f4a38737e981",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nWhat is the maximum temperature (°C) for frozen storage under OGS HACCP? Respond with a single number (negative).\n\nANSWER FORMAT: numeric:integer °C",
      "expected": {
        "value": -18,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "-18",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": -18.0,
            "expected": -18.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 777,
            "output_tokens": 5,
            "latency_ms": 716.0,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "-18",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": -18.0,
            "expected": -18.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 777,
            "output_tokens": 5,
            "latency_ms": 607.4,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "-18",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": -18.0,
            "expected": -18.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 777,
            "output_tokens": 5,
            "latency_ms": 635.0,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 777.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 652.8,
        "usd_sum": 0.000123
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-008",
      "category": "safety",
      "capability": "thawing_method",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "9b8f975bd042c7a0",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nWhich of the following thawing methods is explicitly NOT permitted under OGS HACCP: 'in the fridge at 4 °C', 'under cold running water', 'in the microwave just before cooking', 'at room temperature on the counter'? Respond with the exact method phrase.\n\nINPUT:\n{\n  \"methods\": [\n    \"in the fridge at 4 °C\",\n    \"under cold running water\",\n    \"in the microwave just before cooking\",\n    \"at room temperature on the counter\"\n  ]\n}\n\nANSWER FORMAT: exact:method phrase",
      "expected": {
        "value": "at room temperature on the counter",
        "aliases": [
          "room temperature",
          "on the counter",
          "room-temperature thawing"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "at room temperature on the counter",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "at room temperature on the counter",
            "expected": "at room temperature on the counter",
            "aliases": [
              "room temperature",
              "on the counter",
              "room-temperature thawing"
            ]
          },
          "metrics": {
            "input_tokens": 853,
            "output_tokens": 9,
            "latency_ms": 623.1,
            "usd": 4.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "at room temperature on the counter",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "at room temperature on the counter",
            "expected": "at room temperature on the counter",
            "aliases": [
              "room temperature",
              "on the counter",
              "room-temperature thawing"
            ]
          },
          "metrics": {
            "input_tokens": 853,
            "output_tokens": 9,
            "latency_ms": 957.3,
            "usd": 4.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "at room temperature on the counter",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "at room temperature on the counter",
            "expected": "at room temperature on the counter",
            "aliases": [
              "room temperature",
              "on the counter",
              "room-temperature thawing"
            ]
          },
          "metrics": {
            "input_tokens": 853,
            "output_tokens": 9,
            "latency_ms": 526.9,
            "usd": 4.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 853.0,
        "output_tokens_mean": 9.0,
        "latency_ms_mean": 702.433,
        "usd_sum": 0.000138
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-009",
      "category": "safety",
      "capability": "ccp_identification",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "set_f1",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "9f4b49c1c20e4376",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nFor a cook-chill dish of chicken curry with rice, identify the Critical Control Points from the OGS stage list. Return a comma-separated list using the OGS CCP stage IDs.\n\nINPUT:\n{\n  \"dish\": \"cook-chill chicken curry with rice\"\n}\n\nANSWER FORMAT: set:comma-separated CCP stage IDs",
      "expected": {
        "values": [
          "cooking",
          "cooling",
          "storage_cold",
          "reheating",
          "hot_holding"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "receiving,storage_cold,thawing,cooking,hot_holding,cooling,reheating,service",
          "compliant": true,
          "score": 0.7692,
          "detail": {
            "predicted": [
              "cooking",
              "cooling",
              "hot_holding",
              "receiving",
              "reheating",
              "service",
              "storage_cold",
              "thawing"
            ],
            "expected": [
              "cooking",
              "cooling",
              "hot_holding",
              "reheating",
              "storage_cold"
            ],
            "precision": 0.625,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 807,
            "output_tokens": 26,
            "latency_ms": 716.6,
            "usd": 5.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "cooking,hot_holding,cooling,reheating,service",
          "compliant": true,
          "score": 0.8,
          "detail": {
            "predicted": [
              "cooking",
              "cooling",
              "hot_holding",
              "reheating",
              "service"
            ],
            "expected": [
              "cooking",
              "cooling",
              "hot_holding",
              "reheating",
              "storage_cold"
            ],
            "precision": 0.8,
            "recall": 0.8
          },
          "metrics": {
            "input_tokens": 807,
            "output_tokens": 16,
            "latency_ms": 593.0,
            "usd": 4.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "cooking,hot_holding,cooling,reheating,service",
          "compliant": true,
          "score": 0.8,
          "detail": {
            "predicted": [
              "cooking",
              "cooling",
              "hot_holding",
              "reheating",
              "service"
            ],
            "expected": [
              "cooking",
              "cooling",
              "hot_holding",
              "reheating",
              "storage_cold"
            ],
            "precision": 0.8,
            "recall": 0.8
          },
          "metrics": {
            "input_tokens": 807,
            "output_tokens": 16,
            "latency_ms": 666.0,
            "usd": 4.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.7897,
      "score_summary": {
        "mean": 0.7897,
        "stdev": 0.0178,
        "sem": 0.0103,
        "ci95_lo": 0.7696,
        "ci95_hi": 0.8099,
        "n": 3
      },
      "score_if_valid": 0.7897,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 807.0,
        "output_tokens_mean": 19.333,
        "latency_ms_mean": 658.533,
        "usd_sum": 0.000145
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-010",
      "category": "safety",
      "capability": "cooking_temperature",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "eea6814938f18565",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nA bistro serves a rare whole-muscle ribeye (seared exterior, internal 46 °C). Under OGS HACCP, is this permitted? Answer yes or no.\n\nINPUT:\n{\n  \"product\": \"whole-muscle ribeye\",\n  \"core_c\": 46\n}\n\nANSWER FORMAT: exact:yes|no",
      "expected": {
        "value": "yes"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "no",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "no",
            "expected": "yes",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 811,
            "output_tokens": 4,
            "latency_ms": 518.3,
            "usd": 4.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "no",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "no",
            "expected": "yes",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 811,
            "output_tokens": 4,
            "latency_ms": 592.3,
            "usd": 4.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "no",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "no",
            "expected": "yes",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 811,
            "output_tokens": 4,
            "latency_ms": 758.1,
            "usd": 4.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 811.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 622.9,
        "usd_sum": 0.000126
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-011",
      "category": "safety",
      "capability": "at_risk_groups",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "edc2981cec11c39e",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nA children's hospital must serve hollandaise sauce. Which is the ONE compliant option: 'fresh shell eggs at 63 °C', 'fresh shell eggs at 70 °C', 'pasteurised egg yolk', 'raw shell eggs with lemon juice'? Respond with the exact phrase.\n\nINPUT:\n{\n  \"options\": [\n    \"fresh shell eggs at 63 °C\",\n    \"fresh shell eggs at 70 °C\",\n    \"pasteurised egg yolk\",\n    \"raw shell eggs with lemon juice\"\n  ]\n}\n\nANSWER FORMAT: exact:option phrase",
      "expected": {
        "value": "pasteurised egg yolk",
        "aliases": [
          "pasteurised egg",
          "pasteurized egg yolk",
          "pasteurized egg"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "pasteurised egg yolk",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "pasteurised egg yolk",
            "expected": "pasteurised egg yolk",
            "aliases": [
              "pasteurised egg",
              "pasteurized egg yolk",
              "pasteurized egg"
            ]
          },
          "metrics": {
            "input_tokens": 860,
            "output_tokens": 9,
            "latency_ms": 603.1,
            "usd": 4.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "pasteurised egg yolk",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "pasteurised egg yolk",
            "expected": "pasteurised egg yolk",
            "aliases": [
              "pasteurised egg",
              "pasteurized egg yolk",
              "pasteurized egg"
            ]
          },
          "metrics": {
            "input_tokens": 860,
            "output_tokens": 9,
            "latency_ms": 612.9,
            "usd": 4.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "pasteurised egg yolk",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "pasteurised egg yolk",
            "expected": "pasteurised egg yolk",
            "aliases": [
              "pasteurised egg",
              "pasteurized egg yolk",
              "pasteurized egg"
            ]
          },
          "metrics": {
            "input_tokens": 860,
            "output_tokens": 9,
            "latency_ms": 812.4,
            "usd": 4.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 860.0,
        "output_tokens_mean": 9.0,
        "latency_ms_mean": 676.133,
        "usd_sum": 0.000141
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-012",
      "category": "safety",
      "capability": "numeric_vector",
      "difficulty": "hard",
      "cuisine_region": "global",
      "scorer": "numeric_vector_mae",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "2ba4113ffe26b2fc",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nReturn the canonical HACCP reference temperatures in °C for this dish flow (poultry cook-chill operation): [cold_storage_max, poultry_core_cook_min, hot_hold_min, reheat_min, frozen_storage_max]. Respond with a JSON array of five numbers only, in that order.\n\nINPUT:\n{\n  \"flow\": \"poultry cook-chill\"\n}\n\nANSWER FORMAT: json:array of 5 numbers",
      "expected": {
        "values": [
          5,
          75,
          63,
          75,
          -18
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[5,75,63,75,-18]",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 5,
                "expected": 5,
                "score": 1.0
              },
              {
                "index": 1,
                "predicted": 75,
                "expected": 75,
                "score": 1.0
              },
              {
                "index": 2,
                "predicted": 63,
                "expected": 63,
                "score": 1.0
              },
              {
                "index": 3,
                "predicted": 75,
                "expected": 75,
                "score": 1.0
              },
              {
                "index": 4,
                "predicted": -18,
                "expected": -18,
                "score": 1.0
              }
            ],
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 829,
            "output_tokens": 14,
            "latency_ms": 674.6,
            "usd": 4.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[5,75,63,75,-18]",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 5,
                "expected": 5,
                "score": 1.0
              },
              {
                "index": 1,
                "predicted": 75,
                "expected": 75,
                "score": 1.0
              },
              {
                "index": 2,
                "predicted": 63,
                "expected": 63,
                "score": 1.0
              },
              {
                "index": 3,
                "predicted": 75,
                "expected": 75,
                "score": 1.0
              },
              {
                "index": 4,
                "predicted": -18,
                "expected": -18,
                "score": 1.0
              }
            ],
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 829,
            "output_tokens": 14,
            "latency_ms": 834.9,
            "usd": 4.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[5, 75, 63, 75, -18]",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 5,
                "expected": 5,
                "score": 1.0
              },
              {
                "index": 1,
                "predicted": 75,
                "expected": 75,
                "score": 1.0
              },
              {
                "index": 2,
                "predicted": 63,
                "expected": 63,
                "score": 1.0
              },
              {
                "index": 3,
                "predicted": 75,
                "expected": 75,
                "score": 1.0
              },
              {
                "index": 4,
                "predicted": -18,
                "expected": -18,
                "score": 1.0
              }
            ],
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 829,
            "output_tokens": 18,
            "latency_ms": 721.7,
            "usd": 4.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 829.0,
        "output_tokens_mean": 15.333,
        "latency_ms_mean": 743.733,
        "usd_sum": 0.000143
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-001",
      "category": "sensory",
      "capability": "basic_taste_estimation",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "913118356da8150b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nEstimate the umami intensity of a 36-month aged Parmigiano-Reggiano on the OGS 0-10 intensity scale (0 = not present, 10 = extreme). Respond with only a single decimal number.\n\nINPUT:\n{\n  \"ingredient\": \"Parmigiano-Reggiano 36 months\",\n  \"attribute\": \"umami\"\n}\n\nANSWER FORMAT: integer_0_10",
      "expected": {
        "value": 8.5,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "8.6",
          "compliant": true,
          "score": 0.9333,
          "detail": {
            "predicted": 8.6,
            "expected": 8.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 378,
            "output_tokens": 6,
            "latency_ms": 581.0,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "9.2",
          "compliant": true,
          "score": 0.5333,
          "detail": {
            "predicted": 9.2,
            "expected": 8.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 378,
            "output_tokens": 6,
            "latency_ms": 629.8,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "9.0",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": 9.0,
            "expected": 8.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 378,
            "output_tokens": 6,
            "latency_ms": 724.5,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.7111,
      "score_summary": {
        "mean": 0.7111,
        "stdev": 0.2037,
        "sem": 0.1176,
        "ci95_lo": 0.4806,
        "ci95_hi": 0.9416,
        "n": 3
      },
      "score_if_valid": 0.7111,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 378.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 645.1,
        "usd_sum": 6.3e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-002",
      "category": "sensory",
      "capability": "basic_taste_estimation",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "2690028d89b60292",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nEstimate the sour intensity of undiluted fresh lemon juice on the OGS 0-10 intensity scale. Respond with only a single decimal number.\n\nINPUT:\n{\n  \"ingredient\": \"fresh lemon juice (undiluted)\",\n  \"attribute\": \"sour\"\n}\n\nANSWER FORMAT: integer_0_10",
      "expected": {
        "value": 9.5,
        "tolerance": 1.0
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "9.0",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": 9.0,
            "expected": 9.5,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 360,
            "output_tokens": 6,
            "latency_ms": 777.6,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "8.5",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 8.5,
            "expected": 9.5,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 360,
            "output_tokens": 6,
            "latency_ms": 543.6,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "8.5",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 8.5,
            "expected": 9.5,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 360,
            "output_tokens": 6,
            "latency_ms": 671.8,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.1667,
      "score_summary": {
        "mean": 0.1667,
        "stdev": 0.2887,
        "sem": 0.1667,
        "ci95_lo": -0.16,
        "ci95_hi": 0.4933,
        "n": 3
      },
      "score_if_valid": 0.1667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 360.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 664.333,
        "usd_sum": 6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-003",
      "category": "sensory",
      "capability": "structural_estimation",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.structural"
      ],
      "prompt_template_hash": "b09502eb4fd5d61f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n---\n\nEstimate the tannin intensity of a young (under 5 years old) Barolo on the OGS 0-10 intensity scale. Respond with only a single decimal number.\n\nINPUT:\n{\n  \"beverage\": \"young Barolo, <5 years\",\n  \"attribute\": \"tannin\"\n}\n\nANSWER FORMAT: integer_0_10",
      "expected": {
        "value": 8.5,
        "tolerance": 1.0
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "4.0",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 4.0,
            "expected": 8.5,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 348,
            "output_tokens": 6,
            "latency_ms": 609.6,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "2.5",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 2.5,
            "expected": 8.5,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 348,
            "output_tokens": 6,
            "latency_ms": 450.7,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "2.5",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 2.5,
            "expected": 8.5,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 348,
            "output_tokens": 6,
            "latency_ms": 582.6,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 348.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 547.633,
        "usd_sum": 6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-004",
      "category": "sensory",
      "capability": "basic_taste_vector",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_vector_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "94f009713056d52f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nReturn the five basic tastes for a ripe, in-season strawberry, each on the OGS 0-10 intensity scale. Respond as a single JSON object with keys sweet, sour, salty, bitter, umami.\n\nINPUT:\n{\n  \"ingredient\": \"ripe, in-season strawberry\"\n}\n\nANSWER FORMAT: json\n\nRESPONSE JSON SCHEMA:\n{\n  \"type\": \"object\",\n  \"required\": [\n    \"sweet\",\n    \"sour\",\n    \"salty\",\n    \"bitter\",\n    \"umami\"\n  ],\n  \"properties\": {\n    \"sweet\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"sour\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"salty\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"bitter\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"umami\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    }\n  },\n  \"additionalProperties\": false\n}",
      "expected": {
        "values": {
          "sweet": 7.0,
          "sour": 4.0,
          "salty": 0.5,
          "bitter": 1.0,
          "umami": 0.5
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\"sweet\":8.2,\"sour\":3.6,\"salty\":0.2,\"bitter\":0.3,\"umami\":0.5}",
          "compliant": true,
          "score": 0.74,
          "detail": {
            "per_key": {
              "sweet": {
                "predicted": 8.2,
                "expected": 7.0,
                "score": 0.4
              },
              "sour": {
                "predicted": 3.6,
                "expected": 4.0,
                "score": 0.8
              },
              "salty": {
                "predicted": 0.2,
                "expected": 0.5,
                "score": 0.85
              },
              "bitter": {
                "predicted": 0.3,
                "expected": 1.0,
                "score": 0.65
              },
              "umami": {
                "predicted": 0.5,
                "expected": 0.5,
                "score": 1.0
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 566,
            "output_tokens": 38,
            "latency_ms": 856.1,
            "usd": 4.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\"sweet\":8.5,\"sour\":3.5,\"salty\":0.2,\"bitter\":0.4,\"umami\":0.3}",
          "compliant": true,
          "score": 0.69,
          "detail": {
            "per_key": {
              "sweet": {
                "predicted": 8.5,
                "expected": 7.0,
                "score": 0.25
              },
              "sour": {
                "predicted": 3.5,
                "expected": 4.0,
                "score": 0.75
              },
              "salty": {
                "predicted": 0.2,
                "expected": 0.5,
                "score": 0.85
              },
              "bitter": {
                "predicted": 0.4,
                "expected": 1.0,
                "score": 0.7
              },
              "umami": {
                "predicted": 0.3,
                "expected": 0.5,
                "score": 0.9
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 566,
            "output_tokens": 38,
            "latency_ms": 767.1,
            "usd": 4.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\"sweet\":8.5,\"sour\":2.5,\"salty\":0.2,\"bitter\":0.3,\"umami\":0.1}",
          "compliant": true,
          "score": 0.56,
          "detail": {
            "per_key": {
              "sweet": {
                "predicted": 8.5,
                "expected": 7.0,
                "score": 0.25
              },
              "sour": {
                "predicted": 2.5,
                "expected": 4.0,
                "score": 0.25
              },
              "salty": {
                "predicted": 0.2,
                "expected": 0.5,
                "score": 0.85
              },
              "bitter": {
                "predicted": 0.3,
                "expected": 1.0,
                "score": 0.65
              },
              "umami": {
                "predicted": 0.1,
                "expected": 0.5,
                "score": 0.8
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 566,
            "output_tokens": 38,
            "latency_ms": 793.6,
            "usd": 4.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.6633,
      "score_summary": {
        "mean": 0.6633,
        "stdev": 0.0929,
        "sem": 0.0536,
        "ci95_lo": 0.5582,
        "ci95_hi": 0.7685,
        "n": 3
      },
      "score_if_valid": 0.6633,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 566.0,
        "output_tokens_mean": 38.0,
        "latency_ms_mean": 805.6,
        "usd_sum": 0.000132
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-005",
      "category": "sensory",
      "capability": "structural_estimation",
      "difficulty": "medium",
      "cuisine_region": "american",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.structural"
      ],
      "prompt_template_hash": "c950ec9851ad52bf",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n---\n\nEstimate the body of a New World (California) oaked Chardonnay at 14% ABV on the OGS 0-10 body scale (0 = water-thin, 10 = syrupy). Respond with only a single decimal number.\n\nINPUT:\n{\n  \"beverage\": \"oaked California Chardonnay, 14% ABV\",\n  \"attribute\": \"body\"\n}\n\nANSWER FORMAT: integer_0_10",
      "expected": {
        "value": 7.0,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "8.0",
          "compliant": true,
          "score": 0.3333,
          "detail": {
            "predicted": 8.0,
            "expected": 7.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 364,
            "output_tokens": 6,
            "latency_ms": 727.3,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "7.0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 7.0,
            "expected": 7.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 364,
            "output_tokens": 6,
            "latency_ms": 616.3,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "7.0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 7.0,
            "expected": 7.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 364,
            "output_tokens": 6,
            "latency_ms": 640.0,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.7778,
      "score_summary": {
        "mean": 0.7778,
        "stdev": 0.3849,
        "sem": 0.2222,
        "ci95_lo": 0.3422,
        "ci95_hi": 1.2133,
        "n": 3
      },
      "score_if_valid": 0.7778,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 364.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 661.2,
        "usd_sum": 6.3e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-006",
      "category": "sensory",
      "capability": "dominant_aromatic_family",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "d68ef8562addd55d",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nIdentify the single dominant aromatic FAMILY (from the OGS vocabulary) of a classic Marlborough Sauvignon Blanc. Respond with just the family name.\n\nINPUT:\n{\n  \"beverage\": \"Marlborough Sauvignon Blanc\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "vegetal"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "herbal",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "herbal",
            "expected": "vegetal",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 329,
            "output_tokens": 5,
            "latency_ms": 677.9,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "herbal",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "herbal",
            "expected": "vegetal",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 329,
            "output_tokens": 5,
            "latency_ms": 439.9,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "fruit",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "fruit",
            "expected": "vegetal",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 329,
            "output_tokens": 4,
            "latency_ms": 585.9,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 329.0,
        "output_tokens_mean": 4.667,
        "latency_ms_mean": 567.9,
        "usd_sum": 5.4e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-007",
      "category": "sensory",
      "capability": "structural_vector",
      "difficulty": "hard",
      "cuisine_region": "french",
      "scorer": "numeric_vector_mae",
      "context_needs": [
        "scales.structural"
      ],
      "prompt_template_hash": "658c14cce434cc0a",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n---\n\nReturn the OGS structural attributes of a Brut Champagne on the 0-10 scale. Respond as a JSON object with keys body, acidity_perception, tannin, astringency, alcohol_heat, effervescence.\n\nINPUT:\n{\n  \"beverage\": \"Brut non-vintage Champagne\"\n}\n\nANSWER FORMAT: json\n\nRESPONSE JSON SCHEMA:\n{\n  \"type\": \"object\",\n  \"required\": [\n    \"body\",\n    \"acidity_perception\",\n    \"tannin\",\n    \"astringency\",\n    \"alcohol_heat\",\n    \"effervescence\"\n  ],\n  \"properties\": {\n    \"body\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"acidity_perception\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"tannin\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"astringency\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"alcohol_heat\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"effervescence\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    }\n  },\n  \"additionalProperties\": false\n}",
      "expected": {
        "values": {
          "body": 4.5,
          "acidity_perception": 8.0,
          "tannin": 0.5,
          "astringency": 1.0,
          "alcohol_heat": 3.5,
          "effervescence": 9.0
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"body\": 3.0,\n  \"acidity_perception\": 7.0,\n  \"tannin\": 0.5,\n  \"astringency\": 0.5,\n  \"alcohol_heat\": 3.0,\n  \"effervescence\": 9.0\n}",
          "compliant": true,
          "score": 0.7083,
          "detail": {
            "per_key": {
              "body": {
                "predicted": 3.0,
                "expected": 4.5,
                "score": 0.25
              },
              "acidity_perception": {
                "predicted": 7.0,
                "expected": 8.0,
                "score": 0.5
              },
              "tannin": {
                "predicted": 0.5,
                "expected": 0.5,
                "score": 1.0
              },
              "astringency": {
                "predicted": 0.5,
                "expected": 1.0,
                "score": 0.75
              },
              "alcohol_heat": {
                "predicted": 3.0,
                "expected": 3.5,
                "score": 0.75
              },
              "effervescence": {
                "predicted": 9.0,
                "expected": 9.0,
                "score": 1.0
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 593,
            "output_tokens": 69,
            "latency_ms": 1097.9,
            "usd": 5.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"body\": 3.5,\n  \"acidity_perception\": 7.5,\n  \"tannin\": 0.8,\n  \"astringency\": 1.0,\n  \"alcohol_heat\": 3.0,\n  \"effervescence\": 9.0\n}",
          "compliant": true,
          "score": 0.8083,
          "detail": {
            "per_key": {
              "body": {
                "predicted": 3.5,
                "expected": 4.5,
                "score": 0.5
              },
              "acidity_perception": {
                "predicted": 7.5,
                "expected": 8.0,
                "score": 0.75
              },
              "tannin": {
                "predicted": 0.8,
                "expected": 0.5,
                "score": 0.85
              },
              "astringency": {
                "predicted": 1.0,
                "expected": 1.0,
                "score": 1.0
              },
              "alcohol_heat": {
                "predicted": 3.0,
                "expected": 3.5,
                "score": 0.75
              },
              "effervescence": {
                "predicted": 9.0,
                "expected": 9.0,
                "score": 1.0
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 593,
            "output_tokens": 69,
            "latency_ms": 918.5,
            "usd": 5.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"body\": 2.5,\n  \"acidity_perception\": 7.5,\n  \"tannin\": 0.5,\n  \"astringency\": 0.5,\n  \"alcohol_heat\": 2.5,\n  \"effervescence\": 9.0\n}",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "per_key": {
              "body": {
                "predicted": 2.5,
                "expected": 4.5,
                "score": 0.0
              },
              "acidity_perception": {
                "predicted": 7.5,
                "expected": 8.0,
                "score": 0.75
              },
              "tannin": {
                "predicted": 0.5,
                "expected": 0.5,
                "score": 1.0
              },
              "astringency": {
                "predicted": 0.5,
                "expected": 1.0,
                "score": 0.75
              },
              "alcohol_heat": {
                "predicted": 2.5,
                "expected": 3.5,
                "score": 0.5
              },
              "effervescence": {
                "predicted": 9.0,
                "expected": 9.0,
                "score": 1.0
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 593,
            "output_tokens": 69,
            "latency_ms": 971.2,
            "usd": 5.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.7278,
      "score_summary": {
        "mean": 0.7278,
        "stdev": 0.0728,
        "sem": 0.042,
        "ci95_lo": 0.6454,
        "ci95_hi": 0.8101,
        "n": 3
      },
      "score_if_valid": 0.7278,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 593.0,
        "output_tokens_mean": 69.0,
        "latency_ms_mean": 995.867,
        "usd_sum": 0.000171
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-008",
      "category": "sensory",
      "capability": "basic_taste_estimation",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "873c6c2f9608e43f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nEstimate the bitter intensity of raw radicchio on the OGS 0-10 intensity scale. Respond with only a single decimal number.\n\nINPUT:\n{\n  \"ingredient\": \"raw radicchio\",\n  \"attribute\": \"bitter\"\n}\n\nANSWER FORMAT: integer_0_10",
      "expected": {
        "value": 6.5,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "7.5",
          "compliant": true,
          "score": 0.3333,
          "detail": {
            "predicted": 7.5,
            "expected": 6.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 355,
            "output_tokens": 6,
            "latency_ms": 571.2,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "7.5",
          "compliant": true,
          "score": 0.3333,
          "detail": {
            "predicted": 7.5,
            "expected": 6.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 355,
            "output_tokens": 6,
            "latency_ms": 625.9,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "8.0",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 8.0,
            "expected": 6.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 355,
            "output_tokens": 6,
            "latency_ms": 703.5,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.2222,
      "score_summary": {
        "mean": 0.2222,
        "stdev": 0.1924,
        "sem": 0.1111,
        "ci95_lo": 0.0044,
        "ci95_hi": 0.44,
        "n": 3
      },
      "score_if_valid": 0.2222,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 355.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 633.533,
        "usd_sum": 6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-009",
      "category": "sensory",
      "capability": "basic_taste_estimation",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "523e04f39252cb4b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nEstimate the umami intensity of a plain 10-minute cold-steep kombu dashi on the OGS 0-10 scale. Respond with only a single decimal number.\n\nINPUT:\n{\n  \"ingredient\": \"kombu dashi (cold steep 10 min, 10 g/L Hokkaido kombu)\",\n  \"attribute\": \"umami\"\n}\n\nANSWER FORMAT: decimal_0_10",
      "expected": {
        "value": 6.5,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "6.5",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 6.5,
            "expected": 6.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 379,
            "output_tokens": 6,
            "latency_ms": 664.6,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "7.5",
          "compliant": true,
          "score": 0.3333,
          "detail": {
            "predicted": 7.5,
            "expected": 6.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 379,
            "output_tokens": 6,
            "latency_ms": 889.3,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "7.8",
          "compliant": true,
          "score": 0.1333,
          "detail": {
            "predicted": 7.8,
            "expected": 6.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 379,
            "output_tokens": 6,
            "latency_ms": 751.3,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.4889,
      "score_summary": {
        "mean": 0.4889,
        "stdev": 0.4538,
        "sem": 0.262,
        "ci95_lo": -0.0247,
        "ci95_hi": 1.0024,
        "n": 3
      },
      "score_if_valid": 0.4889,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 379.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 768.4,
        "usd_sum": 6.3e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-010",
      "category": "sensory",
      "capability": "basic_taste_estimation",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "1813cae53ba17a84",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nEstimate the bitter intensity of a square of unsweetened 85% cacao dark chocolate on the OGS 0-10 scale.\n\nINPUT:\n{\n  \"ingredient\": \"single-origin 85% dark chocolate\",\n  \"attribute\": \"bitter\"\n}\n\nANSWER FORMAT: decimal_0_10",
      "expected": {
        "value": 6.5,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "8.6",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 8.6,
            "expected": 6.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 357,
            "output_tokens": 6,
            "latency_ms": 1275.5,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "8.0",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 8.0,
            "expected": 6.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 357,
            "output_tokens": 6,
            "latency_ms": 658.4,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "9.2",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 9.2,
            "expected": 6.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 357,
            "output_tokens": 6,
            "latency_ms": 614.7,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 357.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 849.533,
        "usd_sum": 6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-011",
      "category": "sensory",
      "capability": "structural_estimation",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.structural"
      ],
      "prompt_template_hash": "1886b2e009716d1d",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n---\n\nEstimate the perceived acidity of freshly squeezed yuzu juice on the OGS 0-10 scale.\n\nINPUT:\n{\n  \"ingredient\": \"fresh yuzu juice\",\n  \"attribute\": \"acidity_perception\"\n}\n\nANSWER FORMAT: decimal_0_10",
      "expected": {
        "value": 9.0,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "7.5",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 7.5,
            "expected": 9.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 330,
            "output_tokens": 6,
            "latency_ms": 505.7,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "7.5",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 7.5,
            "expected": 9.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 330,
            "output_tokens": 6,
            "latency_ms": 550.3,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "7.5",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 7.5,
            "expected": 9.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 330,
            "output_tokens": 6,
            "latency_ms": 553.0,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 330.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 536.333,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-012",
      "category": "sensory",
      "capability": "basic_taste_vector",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_vector_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "8d40a83f1b501685",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nReturn the five basic tastes of fresh-squeezed Valencia orange juice on the OGS 0-10 scale as a JSON object with keys sweet, sour, salty, bitter, umami. Respond with ONLY the JSON.\n\nINPUT:\n{\n  \"ingredient\": \"freshly squeezed Valencia orange juice\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "values": {
          "sweet": 7.0,
          "sour": 5.5,
          "salty": 0.0,
          "bitter": 0.5,
          "umami": 0.0
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"sweet\": 7.8,\n  \"sour\": 6.5,\n  \"salty\": 0.4,\n  \"bitter\": 1.3,\n  \"umami\": 0.6\n}",
          "compliant": true,
          "score": 0.64,
          "detail": {
            "per_key": {
              "sweet": {
                "predicted": 7.8,
                "expected": 7.0,
                "score": 0.6
              },
              "sour": {
                "predicted": 6.5,
                "expected": 5.5,
                "score": 0.5
              },
              "salty": {
                "predicted": 0.4,
                "expected": 0.0,
                "score": 0.8
              },
              "bitter": {
                "predicted": 1.3,
                "expected": 0.5,
                "score": 0.6
              },
              "umami": {
                "predicted": 0.6,
                "expected": 0.0,
                "score": 0.7
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 363,
            "output_tokens": 54,
            "latency_ms": 987.5,
            "usd": 4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\"sweet\":8.5,\"sour\":6.5,\"salty\":0.2,\"bitter\":1.2,\"umami\":0.3}",
          "compliant": true,
          "score": 0.63,
          "detail": {
            "per_key": {
              "sweet": {
                "predicted": 8.5,
                "expected": 7.0,
                "score": 0.25
              },
              "sour": {
                "predicted": 6.5,
                "expected": 5.5,
                "score": 0.5
              },
              "salty": {
                "predicted": 0.2,
                "expected": 0.0,
                "score": 0.9
              },
              "bitter": {
                "predicted": 1.2,
                "expected": 0.5,
                "score": 0.65
              },
              "umami": {
                "predicted": 0.3,
                "expected": 0.0,
                "score": 0.85
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 363,
            "output_tokens": 38,
            "latency_ms": 988.2,
            "usd": 3.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\"sweet\":8.2,\"sour\":5.8,\"salty\":0.2,\"bitter\":0.6,\"umami\":0.4}",
          "compliant": true,
          "score": 0.78,
          "detail": {
            "per_key": {
              "sweet": {
                "predicted": 8.2,
                "expected": 7.0,
                "score": 0.4
              },
              "sour": {
                "predicted": 5.8,
                "expected": 5.5,
                "score": 0.85
              },
              "salty": {
                "predicted": 0.2,
                "expected": 0.0,
                "score": 0.9
              },
              "bitter": {
                "predicted": 0.6,
                "expected": 0.5,
                "score": 0.95
              },
              "umami": {
                "predicted": 0.4,
                "expected": 0.0,
                "score": 0.8
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 363,
            "output_tokens": 38,
            "latency_ms": 723.0,
            "usd": 3.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.6833,
      "score_summary": {
        "mean": 0.6833,
        "stdev": 0.0839,
        "sem": 0.0484,
        "ci95_lo": 0.5884,
        "ci95_hi": 0.7782,
        "n": 3
      },
      "score_if_valid": 0.6833,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 363.0,
        "output_tokens_mean": 43.333,
        "latency_ms_mean": 899.567,
        "usd_sum": 0.000106
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-013",
      "category": "sensory",
      "capability": "basic_taste_estimation",
      "difficulty": "medium",
      "cuisine_region": "middle_eastern",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "691f59088f3e98f3",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nEstimate the sweet intensity of a fresh Medjool date on the OGS 0-10 scale.\n\nINPUT:\n{\n  \"ingredient\": \"Medjool date, soft and ripe\",\n  \"attribute\": \"sweet\"\n}\n\nANSWER FORMAT: decimal_0_10",
      "expected": {
        "value": 9.0,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "9.0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 9.0,
            "expected": 9.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 351,
            "output_tokens": 6,
            "latency_ms": 795.0,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "9.0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 9.0,
            "expected": 9.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 351,
            "output_tokens": 6,
            "latency_ms": 619.1,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "9.5",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": 9.5,
            "expected": 9.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 351,
            "output_tokens": 6,
            "latency_ms": 625.3,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.8889,
      "score_summary": {
        "mean": 0.8889,
        "stdev": 0.1924,
        "sem": 0.1111,
        "ci95_lo": 0.6711,
        "ci95_hi": 1.1067,
        "n": 3
      },
      "score_if_valid": 0.8889,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 351.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 679.8,
        "usd_sum": 6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-014",
      "category": "sensory",
      "capability": "dominant_aromatic_family",
      "difficulty": "medium",
      "cuisine_region": "scottish",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "c4a83a79265c0e9f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nIdentify the single dominant aromatic FAMILY (OGS vocabulary) of a heavily-peated Islay single malt (e.g. Ardbeg 10).\n\nINPUT:\n{\n  \"item\": \"heavily peated Islay single malt whisky, ~50 ppm phenols\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "wood"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "earth",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "earth",
            "expected": "wood",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 341,
            "output_tokens": 4,
            "latency_ms": 585.3,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "earth",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "earth",
            "expected": "wood",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 341,
            "output_tokens": 4,
            "latency_ms": 608.8,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "earth",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "earth",
            "expected": "wood",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 341,
            "output_tokens": 4,
            "latency_ms": 585.4,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 341.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 593.167,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-015",
      "category": "sensory",
      "capability": "structural_vector",
      "difficulty": "hard",
      "cuisine_region": "italian",
      "scorer": "numeric_vector_mae",
      "context_needs": [
        "scales.structural"
      ],
      "prompt_template_hash": "66d64ca2254c09ee",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n---\n\nReturn the OGS structural attributes of a 5-year-old Amarone della Valpolicella on the 0-10 scale. Respond with a JSON object with keys body, acidity_perception, tannin, astringency, alcohol_heat, effervescence. JSON only.\n\nINPUT:\n{\n  \"item\": \"Amarone della Valpolicella Classico, 5 years old, 16% abv\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "values": {
          "body": 9.0,
          "acidity_perception": 6.0,
          "tannin": 7.0,
          "astringency": 5.5,
          "alcohol_heat": 8.5,
          "effervescence": 0.0
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\"body\":9,\"acidity_perception\":4,\"tannin\":6,\"astringency\":5,\"alcohol_heat\":7,\"effervescence\":0}",
          "compliant": true,
          "score": 0.5833,
          "detail": {
            "per_key": {
              "body": {
                "predicted": 9,
                "expected": 9.0,
                "score": 1.0
              },
              "acidity_perception": {
                "predicted": 4,
                "expected": 6.0,
                "score": 0.0
              },
              "tannin": {
                "predicted": 6,
                "expected": 7.0,
                "score": 0.5
              },
              "astringency": {
                "predicted": 5,
                "expected": 5.5,
                "score": 0.75
              },
              "alcohol_heat": {
                "predicted": 7,
                "expected": 8.5,
                "score": 0.25
              },
              "effervescence": {
                "predicted": 0,
                "expected": 0.0,
                "score": 1.0
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 370,
            "output_tokens": 38,
            "latency_ms": 979.9,
            "usd": 3.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"body\": 9,\n  \"acidity_perception\": 4,\n  \"tannin\": 6,\n  \"astringency\": 5,\n  \"alcohol_heat\": 7,\n  \"effervescence\": 0\n}",
          "compliant": true,
          "score": 0.5833,
          "detail": {
            "per_key": {
              "body": {
                "predicted": 9,
                "expected": 9.0,
                "score": 1.0
              },
              "acidity_perception": {
                "predicted": 4,
                "expected": 6.0,
                "score": 0.0
              },
              "tannin": {
                "predicted": 6,
                "expected": 7.0,
                "score": 0.5
              },
              "astringency": {
                "predicted": 5,
                "expected": 5.5,
                "score": 0.75
              },
              "alcohol_heat": {
                "predicted": 7,
                "expected": 8.5,
                "score": 0.25
              },
              "effervescence": {
                "predicted": 0,
                "expected": 0.0,
                "score": 1.0
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 370,
            "output_tokens": 57,
            "latency_ms": 1005.0,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"body\": 9,\n  \"acidity_perception\": 4,\n  \"tannin\": 6,\n  \"astringency\": 5,\n  \"alcohol_heat\": 6,\n  \"effervescence\": 0\n}",
          "compliant": true,
          "score": 0.5417,
          "detail": {
            "per_key": {
              "body": {
                "predicted": 9,
                "expected": 9.0,
                "score": 1.0
              },
              "acidity_perception": {
                "predicted": 4,
                "expected": 6.0,
                "score": 0.0
              },
              "tannin": {
                "predicted": 6,
                "expected": 7.0,
                "score": 0.5
              },
              "astringency": {
                "predicted": 5,
                "expected": 5.5,
                "score": 0.75
              },
              "alcohol_heat": {
                "predicted": 6,
                "expected": 8.5,
                "score": 0.0
              },
              "effervescence": {
                "predicted": 0,
                "expected": 0.0,
                "score": 1.0
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 370,
            "output_tokens": 57,
            "latency_ms": 894.3,
            "usd": 4.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.5694,
      "score_summary": {
        "mean": 0.5694,
        "stdev": 0.024,
        "sem": 0.0139,
        "ci95_lo": 0.5423,
        "ci95_hi": 0.5966,
        "n": 3
      },
      "score_if_valid": 0.5694,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 370.0,
        "output_tokens_mean": 50.667,
        "latency_ms_mean": 959.733,
        "usd_sum": 0.000116
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-016",
      "category": "sensory",
      "capability": "basic_taste_estimation",
      "difficulty": "medium",
      "cuisine_region": "spanish",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "70318c6fd3b033d2",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nEstimate the umami intensity of 24-month acorn-finished jamón ibérico de bellota on the OGS 0-10 scale.\n\nINPUT:\n{\n  \"ingredient\": \"jamón ibérico de bellota, 24 months, acorn-finished\",\n  \"attribute\": \"umami\"\n}\n\nANSWER FORMAT: decimal_0_10",
      "expected": {
        "value": 8.0,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "8.8",
          "compliant": true,
          "score": 0.4667,
          "detail": {
            "predicted": 8.8,
            "expected": 8.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 369,
            "output_tokens": 6,
            "latency_ms": 668.8,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "7.8",
          "compliant": true,
          "score": 0.8667,
          "detail": {
            "predicted": 7.8,
            "expected": 8.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 369,
            "output_tokens": 6,
            "latency_ms": 406.1,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "8.3",
          "compliant": true,
          "score": 0.8,
          "detail": {
            "predicted": 8.3,
            "expected": 8.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 369,
            "output_tokens": 6,
            "latency_ms": 516.8,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.7111,
      "score_summary": {
        "mean": 0.7111,
        "stdev": 0.2143,
        "sem": 0.1237,
        "ci95_lo": 0.4686,
        "ci95_hi": 0.9536,
        "n": 3
      },
      "score_if_valid": 0.7111,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 369.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 530.567,
        "usd_sum": 6.3e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-017",
      "category": "sensory",
      "capability": "dominant_aromatic_family",
      "difficulty": "medium",
      "cuisine_region": "thai",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "c3cedde89152ce76",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nIdentify the single dominant aromatic FAMILY of fresh Thai basil (horapa) leaves.\n\nINPUT:\n{\n  \"item\": \"fresh Thai basil (horapa), whole leaves\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "herbal"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "herbal",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "herbal",
            "expected": "herbal",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 320,
            "output_tokens": 5,
            "latency_ms": 696.3,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "herbal",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "herbal",
            "expected": "herbal",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 320,
            "output_tokens": 5,
            "latency_ms": 617.8,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "herbal",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "herbal",
            "expected": "herbal",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 320,
            "output_tokens": 5,
            "latency_ms": 560.4,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 320.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 624.833,
        "usd_sum": 5.4e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-018",
      "category": "sensory",
      "capability": "structural_estimation",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.structural"
      ],
      "prompt_template_hash": "7d31de11173a7fd2",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n---\n\nEstimate the body of a dry Provençal rosé (e.g. Côtes de Provence Grenache-led) on the OGS 0-10 scale.\n\nINPUT:\n{\n  \"item\": \"dry Provençal rosé, 12.5% abv, minimal skin contact\",\n  \"attribute\": \"body\"\n}\n\nANSWER FORMAT: decimal_0_10",
      "expected": {
        "value": 3.5,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "5.0",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 5.0,
            "expected": 3.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 352,
            "output_tokens": 6,
            "latency_ms": 648.5,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "4.5",
          "compliant": true,
          "score": 0.3333,
          "detail": {
            "predicted": 4.5,
            "expected": 3.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 352,
            "output_tokens": 6,
            "latency_ms": 535.8,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "4.5",
          "compliant": true,
          "score": 0.3333,
          "detail": {
            "predicted": 4.5,
            "expected": 3.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 352,
            "output_tokens": 6,
            "latency_ms": 10733.8,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.2222,
      "score_summary": {
        "mean": 0.2222,
        "stdev": 0.1924,
        "sem": 0.1111,
        "ci95_lo": 0.0044,
        "ci95_hi": 0.44,
        "n": 3
      },
      "score_if_valid": 0.2222,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 352.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 3972.7,
        "usd_sum": 6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-019",
      "category": "sensory",
      "capability": "basic_taste_vector",
      "difficulty": "hard",
      "cuisine_region": "korean",
      "scorer": "numeric_vector_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "2e7df899a0148ebd",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nReturn the five basic tastes of well-fermented napa kimchi (6 weeks at 4°C after 2 days at room temp) as a JSON object.\n\nINPUT:\n{\n  \"ingredient\": \"napa kimchi, 6 weeks fermented, ripe\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "values": {
          "sweet": 1.0,
          "sour": 7.0,
          "salty": 6.0,
          "bitter": 1.5,
          "umami": 7.0
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"sweet\": 1.5,\n  \"sour\": 7.5,\n  \"salty\": 6.5,\n  \"bitter\": 1.0,\n  \"umami\": 8.0\n}",
          "compliant": true,
          "score": 0.7,
          "detail": {
            "per_key": {
              "sweet": {
                "predicted": 1.5,
                "expected": 1.0,
                "score": 0.75
              },
              "sour": {
                "predicted": 7.5,
                "expected": 7.0,
                "score": 0.75
              },
              "salty": {
                "predicted": 6.5,
                "expected": 6.0,
                "score": 0.75
              },
              "bitter": {
                "predicted": 1.0,
                "expected": 1.5,
                "score": 0.75
              },
              "umami": {
                "predicted": 8.0,
                "expected": 7.0,
                "score": 0.5
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 354,
            "output_tokens": 54,
            "latency_ms": 995.1,
            "usd": 3.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"sweet\": 1.5,\n  \"sour\": 6.5,\n  \"salty\": 6.0,\n  \"bitter\": 1.0,\n  \"umami\": 7.5\n}",
          "compliant": true,
          "score": 0.8,
          "detail": {
            "per_key": {
              "sweet": {
                "predicted": 1.5,
                "expected": 1.0,
                "score": 0.75
              },
              "sour": {
                "predicted": 6.5,
                "expected": 7.0,
                "score": 0.75
              },
              "salty": {
                "predicted": 6.0,
                "expected": 6.0,
                "score": 1.0
              },
              "bitter": {
                "predicted": 1.0,
                "expected": 1.5,
                "score": 0.75
              },
              "umami": {
                "predicted": 7.5,
                "expected": 7.0,
                "score": 0.75
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 354,
            "output_tokens": 54,
            "latency_ms": 914.4,
            "usd": 3.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"sweet\": 1.5,\n  \"sour\": 7.5,\n  \"salty\": 6.5,\n  \"bitter\": 1.0,\n  \"umami\": 8.5\n}",
          "compliant": true,
          "score": 0.65,
          "detail": {
            "per_key": {
              "sweet": {
                "predicted": 1.5,
                "expected": 1.0,
                "score": 0.75
              },
              "sour": {
                "predicted": 7.5,
                "expected": 7.0,
                "score": 0.75
              },
              "salty": {
                "predicted": 6.5,
                "expected": 6.0,
                "score": 0.75
              },
              "bitter": {
                "predicted": 1.0,
                "expected": 1.5,
                "score": 0.75
              },
              "umami": {
                "predicted": 8.5,
                "expected": 7.0,
                "score": 0.25
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 354,
            "output_tokens": 54,
            "latency_ms": 971.9,
            "usd": 3.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.7167,
      "score_summary": {
        "mean": 0.7167,
        "stdev": 0.0764,
        "sem": 0.0441,
        "ci95_lo": 0.6302,
        "ci95_hi": 0.8031,
        "n": 3
      },
      "score_if_valid": 0.7167,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 354.0,
        "output_tokens_mean": 54.0,
        "latency_ms_mean": 960.467,
        "usd_sum": 0.000117
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-020",
      "category": "sensory",
      "capability": "structural_estimation",
      "difficulty": "medium",
      "cuisine_region": "portuguese",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.structural"
      ],
      "prompt_template_hash": "ca5b9ab85fe71acb",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n---\n\nEstimate the effervescence of a typical young Vinho Verde (traditional style with slight spritz) on the OGS 0-10 scale.\n\nINPUT:\n{\n  \"item\": \"young Vinho Verde, slightly spritzig\",\n  \"attribute\": \"effervescence\"\n}\n\nANSWER FORMAT: decimal_0_10",
      "expected": {
        "value": 2.5,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "4.5",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 4.5,
            "expected": 2.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 344,
            "output_tokens": 6,
            "latency_ms": 712.3,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "3.5",
          "compliant": true,
          "score": 0.3333,
          "detail": {
            "predicted": 3.5,
            "expected": 2.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 344,
            "output_tokens": 6,
            "latency_ms": 580.8,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "4.5",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 4.5,
            "expected": 2.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 344,
            "output_tokens": 6,
            "latency_ms": 526.7,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.1111,
      "score_summary": {
        "mean": 0.1111,
        "stdev": 0.1924,
        "sem": 0.1111,
        "ci95_lo": -0.1067,
        "ci95_hi": 0.3289,
        "n": 3
      },
      "score_if_valid": 0.1111,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 344.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 606.6,
        "usd_sum": 6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-001",
      "category": "structured_output",
      "capability": "schema_valid_ingredient",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.ingredient",
        "vocab.ingredient_categories",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "4523d68b7da194f7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS ingredient schema (`schema/ingredient.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/ingredient.schema.json\",\n  \"title\": \"OGS Ingredient\",\n  \"description\": \"An ingredient entity with sensory profile, category, and provenance metadata.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"category\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$\",\n      \"description\": \"OGS identifier in the format ogs:<namespace>:ingredient:<id>\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\",\n      \"description\": \"OGS specification version (0.1.x or 0.2.x)\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\",\n      \"description\": \"Authoritative / endonym language of this entity's human-readable fields.\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      },\n      \"description\": \"Advisory list of languages covered by this entity's LString fields.\"\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"category\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"protein\",\n        \"vegetable\",\n        \"fruit\",\n        \"grain\",\n        \"dairy\",\n        \"fat\",\n        \"herb\",\n        \"spice\",\n        \"fungus\",\n        \"legume\",\n        \"nut\",\n        \"seed\",\n        \"sweetener\",\n        \"acid\",\n        \"salt\",\n        \"other\"\n      ]\n    },\n    \"subcategory\": {\n      \"type\": \"string\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"origin\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"region\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"season\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"terroir_notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Ingredient categories\n\nprotein, vegetable, fruit, grain, dairy, fat, herb, spice, fungus, legume, nut, seed, sweetener, acid, salt, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document that describes 'fleur de sel' as an OGS ingredient. It MUST validate against the OGS ingredient schema (schema/ingredient.schema.json). Respond with ONLY the JSON object, no markdown fences.\n\nINPUT:\n{\n  \"ingredient_name\": \"Fleur de sel\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"fleur-de-sel\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/ingredient.schema.json",
        "required_fields": {
          "id": "ogs:core:ingredient:fleur-de-sel",
          "ogs_version": "0.1.0",
          "category": "salt"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:fleur-de-sel\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"fr\",\n  \"name\": \"Fleur de sel\",\n  \"description\": \"Fine, delicate sea-salt crystals harvested from the surface of evaporating seawater; prized for its purity and crunch, used as a finishing salt.\",\n  \"category\": \"salt\",\n  \"subcategory\": \"sea-salt\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 0,\n      \"sour\": 0,\n      \"salty\": 7.5,\n      \"bitter\": 0,\n      \"umami\": 0.5\n    },\n    \"structural\": {\n      \"body\": 1.5,\n      \"acidity_perception\": 0,\n      \"tannin\": 0,\n      \"astringency\": 0,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"texture\": [\n      \"crisp\",\n      \"crunchy\",\n      \"flake-like\"\n    ],\n    \"finish_length\": 2.0,\n    \"complexity\": 1.5,\n    \"aromatics\": [\n      {\n        \"family\": \"marine\",\n        \"group\": \"salinity/brine\",\n        \"item\": \"sea-breeze\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 2.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"light mineral dust\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 1.0\n      }\n    ]\n  },\n  \"origin\": {\n    \"region\": \"coastal salt pans\",\n    \"season\": \"late spring to early autumn (varies by locale)\",\n    \"terroir_notes\": \"Surface-harvested flakes formed during calm, humid evaporation conditions.\"\n  },\n  \"notes\": \"Best used as a finishing salt to preserve flake integrity; dissolves quickly, delivering a bright salinity without harshness.\"\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:fleur-de-sel",
                "expected": "ogs:core:ingredient:fleur-de-sel",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "salt",
                "expected": "salt",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2513,
            "output_tokens": 456,
            "latency_ms": 3460.3,
            "usd": 0.000308,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:fleur-de-sel\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": {\n    \"en\": \"Fleur de sel\"\n  },\n  \"primary_language\": \"en\",\n  \"category\": \"salt\",\n  \"subcategory\": \"sea salt (flor de sal)\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 0,\n      \"sour\": 0,\n      \"salty\": 8.5,\n      \"bitter\": 0,\n      \"umami\": 0.5\n    },\n    \"structural\": {\n      \"body\": 1,\n      \"acidity_perception\": 0,\n      \"tannin\": 0,\n      \"astringency\": 0,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"marine\",\n        \"group\": \"sea-breeze\",\n        \"item\": \"briny\",\n        \"intensity\": 4\n      }\n    ],\n    \"texture\": [\n      \"crystalline\",\n      \"delicate flakes\"\n    ],\n    \"finish_length\": 2,\n    \"complexity\": 2.5\n  },\n  \"origin\": {\n    \"region\": \"French Atlantic salt pans (traditional association)\"\n  },\n  \"notes\": \"Flaky sea salt collected at the surface of salt ponds; valued for mild salinity and clean, briny finish.\"\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:fleur-de-sel",
                "expected": "ogs:core:ingredient:fleur-de-sel",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "salt",
                "expected": "salt",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2513,
            "output_tokens": 329,
            "latency_ms": 2622.5,
            "usd": 0.000257,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:fleur-de-sel\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": \"Fleur de sel\",\n  \"category\": \"salt\",\n  \"subcategory\": \"finely flaky sea salt (surface salt)\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 0,\n      \"sour\": 0,\n      \"salty\": 8.5,\n      \"bitter\": 0.2,\n      \"umami\": 0.4\n    },\n    \"structural\": {\n      \"body\": 0.3,\n      \"acidity_perception\": 0,\n      \"tannin\": 0,\n      \"astringency\": 0,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"marine\",\n        \"group\": \"saline/mineral\",\n        \"item\": \"sea-breeze brine\",\n        \"intensity\": 2.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"subtle clay/ground notes\",\n        \"intensity\": 1.2\n      }\n    ],\n    \"texture\": [\n      \"crystalline flakes\",\n      \"light crunch\"\n    ],\n    \"finish_length\": 2.0,\n    \"complexity\": 1.8\n  },\n  \"origin\": {\n    \"region\": \"coastal salt marshes (surface-harvested)\",\n    \"season\": \"late spring to early autumn (varies by location)\",\n    \"terroir_notes\": \"Flavor and grain size vary with local salinity, wind, sun exposure, and harvest conditions.\"\n  },\n  \"notes\": \"Collected from the top layer of sea-salt ponds near the end of evaporation; typically used as a finishing salt to add a clean, burst-like salinity and delicate crunch.\",\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"updated\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"general culinary knowledge\",\n    \"license\": \"unspecified\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:fleur-de-sel",
                "expected": "ogs:core:ingredient:fleur-de-sel",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "salt",
                "expected": "salt",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2513,
            "output_tokens": 496,
            "latency_ms": 3597.3,
            "usd": 0.000324,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 2513.0,
        "output_tokens_mean": 427.0,
        "latency_ms_mean": 3226.7,
        "usd_sum": 0.000889
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-002",
      "category": "structured_output",
      "capability": "schema_valid_beverage",
      "difficulty": "hard",
      "cuisine_region": "french",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.beverage",
        "vocab.beverage_types",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "367b58556573440d",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS beverage schema (`schema/beverage.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/beverage.schema.json\",\n  \"title\": \"OGS Beverage\",\n  \"description\": \"A beverage entity with type, structural attributes, sensory profile, and provenance.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"type\",\n    \"sensory_profile\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:beverage:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"type\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"wine\",\n        \"beer\",\n        \"spirit\",\n        \"cider\",\n        \"sake\",\n        \"other\"\n      ]\n    },\n    \"subtype\": {\n      \"type\": \"string\"\n    },\n    \"grape_varieties\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"type\": \"string\"\n      }\n    },\n    \"vintage\": {\n      \"type\": \"integer\"\n    },\n    \"region\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"appellation\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"producer\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"abv\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 100\n    },\n    \"residual_sugar\": {\n      \"type\": \"number\",\n      \"minimum\": 0\n    },\n    \"serve_temperature_c\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"min\": {\n          \"type\": \"number\"\n        },\n        \"max\": {\n          \"type\": \"number\"\n        }\n      },\n      \"additionalProperties\": false,\n      \"description\": \"Recommended serving temperature range in degrees Celsius (SI canonical).\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Beverage types\n\nwine, beer, spirit, cider, sake, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document describing a Sancerre (Sauvignon Blanc, Loire Valley) that validates against the OGS beverage schema. Respond with ONLY the JSON object, no markdown fences.\n\nINPUT:\n{\n  \"beverage_name\": \"Sancerre\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"sancerre-generic\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/beverage.schema.json",
        "required_fields": {
          "id": "ogs:core:beverage:sancerre-generic",
          "ogs_version": "0.1.0",
          "type": "wine"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:sancerre-generic\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Sancerre\",\n  \"type\": \"wine\",\n  \"subtype\": \"Sauvignon Blanc\",\n  \"region\": \"Loire Valley\",\n  \"appellation\": \"Sancerre\",\n  \"grape_varieties\": [\n    \"Sauvignon Blanc\"\n  ],\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 0.5,\n      \"sour\": 3.5,\n      \"salty\": 1,\n      \"bitter\": 1,\n      \"umami\": 1\n    },\n    \"structural\": {\n      \"body\": 4.5,\n      \"acidity_perception\": 7.5,\n      \"tannin\": 0.3,\n      \"astringency\": 0.2,\n      \"alcohol_heat\": 2.5,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"lemon\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 6\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"stone fruit\",\n        \"item\": \"green apple\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"herbaceous\",\n        \"item\": \"gooseberry\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 5\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"white flowers\",\n        \"item\": \"acacia\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 3.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"wet stone\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 3\n      },\n      {\n        \"family\": \"vegetal\",\n        \"group\": \"green notes\",\n        \"item\": \"cut grass\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 3\n      }\n    ],\n    \"texture\": [\n      \"crisp\",\n      \"dry\"\n    ],\n    \"finish_length\": 7,\n    \"complexity\": 6.5\n  },\n  \"notes\": \"Typical Sancerre Sauvignon Blanc: dry, high-acid, mineral-driven with citrus and gooseberry/herbaceous lift.\",\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"updated\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"user-provided prompt\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:sancerre-generic",
                "expected": "ogs:core:beverage:sancerre-generic",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "wine",
                "expected": "wine",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2514,
            "output_tokens": 625,
            "latency_ms": 4835.5,
            "usd": 0.000376,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:sancerre-generic\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": {\n    \"labels\": [\n      {\n        \"value\": \"Sancerre\",\n        \"language\": \"en\",\n        \"role\": \"preferred\"\n      }\n    ],\n    \"primary_language\": \"en\"\n  },\n  \"type\": \"wine\",\n  \"subtype\": \"Sauvignon Blanc\",\n  \"region\": {\n    \"labels\": [\n      {\n        \"value\": \"Loire Valley\",\n        \"language\": \"en\",\n        \"role\": \"preferred\"\n      }\n    ],\n    \"primary_language\": \"en\"\n  },\n  \"appellation\": {\n    \"labels\": [\n      {\n        \"value\": \"Sancerre\",\n        \"language\": \"en\",\n        \"role\": \"preferred\"\n      }\n    ],\n    \"primary_language\": \"en\"\n  },\n  \"grape_varieties\": [\n    \"Sauvignon Blanc\"\n  ],\n  \"abv\": 12.5,\n  \"residual_sugar\": 2.0,\n  \"serve_temperature_c\": {\n    \"min\": 8,\n    \"max\": 12\n  },\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 0.5,\n      \"sour\": 5.5,\n      \"salty\": 1.0,\n      \"bitter\": 1.0,\n      \"umami\": 1.0\n    },\n    \"structural\": {\n      \"body\": 5.0,\n      \"acidity_perception\": 7.5,\n      \"tannin\": 0.2,\n      \"astringency\": 0.3,\n      \"alcohol_heat\": 2.5,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"lemon zest\",\n        \"qualifier\": \"zest\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"stone fruit\",\n        \"item\": \"pear\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"vegetal herbs\",\n        \"item\": \"gooseberry\",\n        \"intensity\": 4.8\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"white flowers\",\n        \"item\": \"elderflower\",\n        \"intensity\": 3.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"wet limestone\",\n        \"intensity\": 4.0\n      }\n    ],\n    \"texture\": [\n      \"crisp\",\n      \"lean\",\n      \"refreshing\"\n    ],\n    \"finish_length\": 7.0,\n    \"complexity\": 6.0\n  },\n  \"primary_language\": \"en\",\n  \"available_languages\": [\n    \"en\"\n  ],\n  \"metadata\": {\n    \"source\": \"user-provided generic description\",\n    \"license\": \"unspecified\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:sancerre-generic",
                "expected": "ogs:core:beverage:sancerre-generic",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "wine",
                "expected": "wine",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2514,
            "output_tokens": 696,
            "latency_ms": 5208.3,
            "usd": 0.000404,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:sancerre-generic\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": {\n    \"primary_language\": \"en\",\n    \"labels\": [\n      {\n        \"value\": \"Sancerre\",\n        \"language\": \"en\",\n        \"role\": \"preferred\"\n      }\n    ]\n  },\n  \"type\": \"wine\",\n  \"subtype\": \"Sauvignon Blanc (Loire Valley)\",\n  \"grape_varieties\": [\n    \"Sauvignon Blanc\"\n  ],\n  \"region\": {\n    \"primary_language\": \"en\",\n    \"labels\": [\n      {\n        \"value\": \"Loire Valley\",\n        \"language\": \"en\",\n        \"role\": \"preferred\"\n      }\n    ]\n  },\n  \"appellation\": {\n    \"primary_language\": \"en\",\n    \"labels\": [\n      {\n        \"value\": \"Sancerre\",\n        \"language\": \"en\",\n        \"role\": \"preferred\"\n      }\n    ]\n  },\n  \"abv\": 12.5,\n  \"serve_temperature_c\": {\n    \"min\": 8,\n    \"max\": 12\n  },\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 0.8,\n      \"sour\": 4.8,\n      \"salty\": 1.6,\n      \"bitter\": 1.2,\n      \"umami\": 1.0\n    },\n    \"structural\": {\n      \"body\": 4.2,\n      \"acidity_perception\": 7.6,\n      \"tannin\": 0.6,\n      \"astringency\": 1.0,\n      \"alcohol_heat\": 2.0,\n      \"effervescence\": 0.2\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"lemon\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 6.8\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"grapefruit\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 4.9\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"green notes\",\n        \"item\": \"fresh-cut grass\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 3.9\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"white flowers\",\n        \"item\": \"elderflower\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 3.2\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"flint\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 4.6\n      }\n    ],\n    \"texture\": [\n      \"lean\",\n      \"crisp\"\n    ],\n    \"finish_length\": 7.0,\n    \"complexity\": 6.1\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:sancerre-generic",
                "expected": "ogs:core:beverage:sancerre-generic",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "wine",
                "expected": "wine",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2514,
            "output_tokens": 670,
            "latency_ms": 5229.5,
            "usd": 0.000394,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 2514.0,
        "output_tokens_mean": 663.667,
        "latency_ms_mean": 5091.1,
        "usd_sum": 0.001174
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-003",
      "category": "structured_output",
      "capability": "schema_valid_pairing",
      "difficulty": "expert",
      "cuisine_region": "french",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.pairing",
        "vocab.match_types",
        "vocab.explanation_codes",
        "vocab.grade_scale",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "2740dc70d150e12a",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS pairing schema (`schema/pairing.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/pairing.schema.json\",\n  \"title\": \"OGS Pairing\",\n  \"description\": \"A scored, explained pairing between a dish and a beverage.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"dish_ref\",\n    \"beverage_ref\",\n    \"match_types\",\n    \"overall_score\",\n    \"grade\",\n    \"explanation_codes\",\n    \"confidence\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:pairing:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"dish_ref\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:dish:[a-z0-9-]+$\"\n    },\n    \"beverage_ref\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:beverage:[a-z0-9-]+$\"\n    },\n    \"match_types\": {\n      \"type\": \"array\",\n      \"minItems\": 1,\n      \"items\": {\n        \"$ref\": \"#/$defs/match_type_entry\"\n      }\n    },\n    \"overall_score\": {\n      \"type\": \"integer\",\n      \"minimum\": 0,\n      \"maximum\": 100\n    },\n    \"grade\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"A+\",\n        \"A\",\n        \"B+\",\n        \"B\",\n        \"C+\",\n        \"C\",\n        \"D\",\n        \"F\"\n      ]\n    },\n    \"explanation_codes\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/explanation_code_entry\"\n      }\n    },\n    \"risks\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/risk_entry\"\n      }\n    },\n    \"confidence\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 1\n    },\n    \"evidence\": {\n      \"$ref\": \"#/$defs/evidence\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"match_type_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"type\",\n        \"score\"\n      ],\n      \"properties\": {\n        \"type\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"harmony\",\n            \"contrast\",\n            \"bridge\",\n            \"intensity_alignment\",\n            \"structural_balance\",\n            \"refresh\"\n          ]\n        },\n        \"score\": {\n          \"type\": \"integer\",\n          \"minimum\": 0,\n          \"maximum\": 100\n        },\n        \"description\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"primary\": {\n          \"type\": \"boolean\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"explanation_code_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"code\"\n      ],\n      \"properties\": {\n        \"code\": {\n          \"type\": \"string\"\n        },\n        \"weight\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 1\n        },\n        \"detail\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"risk_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"code\",\n        \"severity\"\n      ],\n      \"properties\": {\n        \"code\": {\n          \"type\": \"string\"\n        },\n        \"severity\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"low\",\n            \"medium\",\n            \"high\"\n          ]\n        },\n        \"description\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"mitigation\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"evidence\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"method\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"expert_panel\",\n            \"instrumental\",\n            \"computed\",\n            \"estimated\"\n          ]\n        },\n        \"sources\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n### OGS Explanation Codes\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n### Grade scale (derived from 0-100 overall_score)\n\n- **A+**: 95-100\n- **A**: 90-94\n- **B+**: 85-89\n- **B**: 80-84\n- **C+**: 75-79\n- **C**: 70-74\n- **D**: 60-69\n- **F**: <60\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce an OGS pairing JSON document for the described match. It MUST validate against the OGS pairing schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"pairing_slug\": \"brie-champagne\",\n  \"dish_ref\": \"ogs:core:dish:brie-on-baguette\",\n  \"beverage_ref\": \"ogs:core:beverage:brut-champagne-nv\",\n  \"intent\": \"A classic refresh pairing driven by effervescence and acidity against creamy brie.\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/pairing.schema.json",
        "required_fields": {
          "id": "ogs:core:pairing:brie-champagne",
          "ogs_version": "0.1.0",
          "dish_ref": "ogs:core:dish:brie-on-baguette",
          "beverage_ref": "ogs:core:beverage:brut-champagne-nv"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:pairing:brie-champagne\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": {\n    \"en\": \"Brie & Brut Champagne (Refresh by Effervescence)\"\n  },\n  \"dish_ref\": \"ogs:core:dish:brie-on-baguette\",\n  \"beverage_ref\": \"ogs:core:beverage:brut-champagne-nv\",\n  \"match_types\": [\n    {\n      \"type\": \"refresh\",\n      \"score\": 92,\n      \"description\": {\n        \"en\": \"Carbonation and crispness cleanse the palate between bites of creamy brie.\"\n      },\n      \"primary\": true\n    },\n    {\n      \"type\": \"harmony\",\n      \"score\": 82,\n      \"description\": {\n        \"en\": \"Bready, creamy dairy tones align with the wine’s refined, toasty aromatic profile.\"\n      },\n      \"primary\": false\n    },\n    {\n      \"type\": \"structural_balance\",\n      \"score\": 86,\n      \"description\": {\n        \"en\": \"The beverage’s acidity and body balance the dish’s richness without dulling it.\"\n      },\n      \"primary\": false\n    },\n    {\n      \"type\": \"intensity_alignment\",\n      \"score\": 78,\n      \"description\": {\n        \"en\": \"Overall intensity is well-matched, keeping the cheese flavor present while the wine stays crisp.\"\n      },\n      \"primary\": false\n    }\n  ],\n  \"overall_score\": 88,\n  \"grade\": \"A\",\n  \"explanation_codes\": [\n    {\n      \"code\": \"REFRESH_EFFERVESCE\",\n      \"weight\": 0.45,\n      \"detail\": {\n        \"en\": \"Champagne bubbles physically scrub the palate, preventing the brie’s creaminess from becoming heavy.\"\n      }\n    },\n    {\n      \"code\": \"REFRESH_ACID\",\n      \"weight\": 0.35,\n      \"detail\": {\n        \"en\": \"Brut acidity cuts through fat in the brie, refreshing the mouth after each bite.\"\n      }\n    },\n    {\n      \"code\": \"STRUCT_ACIDITY_BALANCE\",\n      \"weight\": 0.2,\n      \"detail\": {\n        \"en\": \"The wine’s acidity restores balance against the dish’s richness and salt.\"\n      }\n    }\n  ],\n  \"confidence\": 0.74,\n  \"evidence\": {\n    \"method\": \"estimated\",\n    \"sources\": [],\n    \"notes\": {\n      \"en\": \"General pairing logic: brut Champagne acidity and effervescence are well-suited to soft, creamy, fat-rich cheeses like brie.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Serve well-chilled for maximum effervescence and acidity impact; keep brut (not sweet) to avoid clashing with dairy richness.\"\n  },\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"user-provided intent; OGS pairing model rationale\",\n    \"license\": \"unspecified\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:pairing:brie-champagne",
                "expected": "ogs:core:pairing:brie-champagne",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "dish_ref": {
                "predicted": "ogs:core:dish:brie-on-baguette",
                "expected": "ogs:core:dish:brie-on-baguette",
                "match": true
              },
              "beverage_ref": {
                "predicted": "ogs:core:beverage:brut-champagne-nv",
                "expected": "ogs:core:beverage:brut-champagne-nv",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3379,
            "output_tokens": 666,
            "latency_ms": 4463.2,
            "usd": 0.000435,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:pairing:brie-champagne\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Brie on baguette × Brut Champagne (NV)\",\n  \"dish_ref\": \"ogs:core:dish:brie-on-baguette\",\n  \"beverage_ref\": \"ogs:core:beverage:brut-champagne-nv\",\n  \"match_types\": [\n    {\n      \"type\": \"refresh\",\n      \"score\": 92,\n      \"description\": {\n        \"en\": \"Champagne effervescence and acidity cleanse the palate against creamy brie.\"\n      },\n      \"primary\": true\n    },\n    {\n      \"type\": \"contrast\",\n      \"score\": 80,\n      \"description\": {\n        \"en\": \"Crisp, dry acidity contrasts the richness and softness of baked/served brie.\"\n      },\n      \"primary\": false\n    },\n    {\n      \"type\": \"structural_balance\",\n      \"score\": 78,\n      \"description\": {\n        \"en\": \"The beverage body and acidity are aligned to keep the pairing balanced with a creamy texture.\"\n      },\n      \"primary\": false\n    },\n    {\n      \"type\": \"harmony\",\n      \"score\": 72,\n      \"description\": {\n        \"en\": \"Overall flavor cohesion is supported by brioche/cheese richness fitting well with Champagne’s yeasty-bready character.\"\n      },\n      \"primary\": false\n    }\n  ],\n  \"overall_score\": 90,\n  \"grade\": \"A\",\n  \"explanation_codes\": [\n    {\n      \"code\": \"REFRESH_EFFERVESCE\",\n      \"weight\": 0.55,\n      \"detail\": {\n        \"en\": \"Carbonation helps scrub creamy fat between bites.\"\n      }\n    },\n    {\n      \"code\": \"REFRESH_ACID\",\n      \"weight\": 0.35,\n      \"detail\": {\n        \"en\": \"Brut acidity cuts through brie’s richness and soft texture.\"\n      }\n    },\n    {\n      \"code\": \"STRUCT_ACIDITY_BALANCE\",\n      \"weight\": 0.1,\n      \"detail\": {\n        \"en\": \"Acidity maintains freshness rather than letting the palate feel heavy.\"\n      }\n    }\n  ],\n  \"confidence\": 0.78,\n  \"evidence\": {\n    \"method\": \"expert_panel\",\n    \"sources\": [\n      \"Classical pairing principles: brut sparkling acidity/effervescence with creamy cheeses\"\n    ],\n    \"notes\": {\n      \"en\": \"Champagne’s dry profile (brut) reduces sweetness conflict and emphasizes palate-cleansing.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Serve chilled; keep the Champagne dry (brut) to maintain lift over creamy brie.\"\n  },\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"updated\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"ogs:benchmark-task\",\n    \"license\": \"unspecified\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:pairing:brie-champagne",
                "expected": "ogs:core:pairing:brie-champagne",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "dish_ref": {
                "predicted": "ogs:core:dish:brie-on-baguette",
                "expected": "ogs:core:dish:brie-on-baguette",
                "match": true
              },
              "beverage_ref": {
                "predicted": "ogs:core:beverage:brut-champagne-nv",
                "expected": "ogs:core:beverage:brut-champagne-nv",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3379,
            "output_tokens": 662,
            "latency_ms": 4377.3,
            "usd": 0.000434,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:pairing:brie-on-baguette-brut-champagne-nv\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": {\n    \"en\": \"Brie on Baguette with Brut Champagne (NV)\"\n  },\n  \"description\": {\n    \"en\": \"A classic refresh pairing where Champagne’s effervescence and acidity cut through creamy brie while preserving aromatic lift.\"\n  },\n  \"dish_ref\": \"ogs:core:dish:brie-on-baguette\",\n  \"beverage_ref\": \"ogs:core:beverage:brut-champagne-nv\",\n  \"match_types\": [\n    {\n      \"type\": \"refresh\",\n      \"score\": 92,\n      \"description\": {\n        \"en\": \"Carbonation and acidity cleanse the palate and refresh between bites of rich, creamy cheese.\"\n      },\n      \"primary\": true\n    },\n    {\n      \"type\": \"contrast\",\n      \"score\": 80,\n      \"description\": {\n        \"en\": \"Bright acidity and crisp bubbles contrast the dish’s richness and creaminess.\"\n      },\n      \"primary\": false\n    },\n    {\n      \"type\": \"harmony\",\n      \"score\": 72,\n      \"description\": {\n        \"en\": \"Overall flavor compatibility: subtle bread/yeast notes and dairy-friendly character support the cheese’s creamy profile.\"\n      },\n      \"primary\": false\n    },\n    {\n      \"type\": \"structural_balance\",\n      \"score\": 78,\n      \"description\": {\n        \"en\": \"The beverage’s acidity and lively body keep mouthfeel balanced against the brie’s fat.\"\n      },\n      \"primary\": false\n    }\n  ],\n  \"overall_score\": 88,\n  \"grade\": \"A\",\n  \"explanation_codes\": [\n    {\n      \"code\": \"REFRESH_EFFERVESCE\",\n      \"weight\": 0.95,\n      \"detail\": {\n        \"en\": \"Champagne bubbles scrub fat and cream from the palate between bites.\"\n      }\n    },\n    {\n      \"code\": \"REFRESH_ACID\",\n      \"weight\": 0.9,\n      \"detail\": {\n        \"en\": \"High acidity lifts and refreshes the brie’s richness.\"\n      }\n    },\n    {\n      \"code\": \"STRUCT_ACIDITY_BALANCE\",\n      \"weight\": 0.7,\n      \"detail\": {\n        \"en\": \"Acidity helps rebalance the dish’s creamy, fatty mouthfeel.\"\n      }\n    }\n  ],\n  \"confidence\": 0.78,\n  \"evidence\": {\n    \"method\": \"estimated\",\n    \"sources\": [\n      \"General pairing principles for brut sparkling wines with creamy, high-fat cheeses\"\n    ],\n    \"notes\": {\n      \"en\": \"Effervescence and acidity are expected to be the dominant drivers for brie.\"\n    }\n  },\n  \"risks\": [],\n  \"notes\": {\n    \"en\": \"Best served well-chilled; brut style supports the refresh effect without adding noticeable sweetness.\"\n  },\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"pairing_task\",\n    \"license\": \"unspecified\"\n  }\n}",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:pairing:brie-on-baguette-brut-champagne-nv",
                "expected": "ogs:core:pairing:brie-champagne",
                "match": false
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "dish_ref": {
                "predicted": "ogs:core:dish:brie-on-baguette",
                "expected": "ogs:core:dish:brie-on-baguette",
                "match": true
              },
              "beverage_ref": {
                "predicted": "ogs:core:beverage:brut-champagne-nv",
                "expected": "ogs:core:beverage:brut-champagne-nv",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3379,
            "output_tokens": 693,
            "latency_ms": 4348.1,
            "usd": 0.000446,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.8333,
      "score_summary": {
        "mean": 0.8333,
        "stdev": 0.2887,
        "sem": 0.1667,
        "ci95_lo": 0.5067,
        "ci95_hi": 1.16,
        "n": 3
      },
      "score_if_valid": 0.8333,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 3379.0,
        "output_tokens_mean": 673.667,
        "latency_ms_mean": 4396.2,
        "usd_sum": 0.001315
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-004",
      "category": "structured_output",
      "capability": "sensory_profile_json",
      "difficulty": "medium",
      "cuisine_region": "indian",
      "scorer": "json_match",
      "context_needs": [
        "scales.basic_tastes",
        "scales.structural",
        "vocab.aromatic_families",
        "vocab.qualifiers"
      ],
      "prompt_template_hash": "9bd6860a8c5c69ec",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Aromatic qualifiers\n\nfresh, dried, cooked, candied, fermented, zest, overripe\n\n---\n\nProduce a JSON object representing the sensory profile for ripe mango. The object must have keys basic_tastes (with sweet, sour, salty, bitter, umami), structural (with body, acidity_perception, tannin, astringency, alcohol_heat, effervescence), and aromatics (array of entries with family, group, item, intensity). All intensities on 0-10. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"item\": \"ripe Alphonso mango, peak ripeness\"\n}\n\nANSWER FORMAT: json\n\nRESPONSE JSON SCHEMA:\n{\n  \"type\": \"object\",\n  \"required\": [\n    \"basic_tastes\",\n    \"structural\",\n    \"aromatics\"\n  ],\n  \"properties\": {\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"sweet\",\n        \"sour\",\n        \"salty\",\n        \"bitter\",\n        \"umami\"\n      ]\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"body\",\n        \"acidity_perception\",\n        \"tannin\",\n        \"astringency\",\n        \"alcohol_heat\",\n        \"effervescence\"\n      ]\n    },\n    \"aromatics\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"type\": \"object\",\n        \"required\": [\n          \"family\",\n          \"group\",\n          \"item\",\n          \"intensity\"\n        ]\n      }\n    }\n  }\n}",
      "expected": {
        "value": {
          "basic_tastes": {
            "sweet": 8.5,
            "sour": 3.0,
            "salty": 0,
            "bitter": 0.5,
            "umami": 0.5
          },
          "structural": {
            "body": 5.5,
            "acidity_perception": 3.0,
            "tannin": 0,
            "astringency": 0.5,
            "alcohol_heat": 0,
            "effervescence": 0
          }
        },
        "required_keys": [
          "basic_tastes.sweet",
          "basic_tastes.sour",
          "structural.body",
          "structural.acidity_perception",
          "structural.tannin"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"basic_tastes\": {\n    \"sweet\": 9.2,\n    \"sour\": 1.8,\n    \"salty\": 0.2,\n    \"bitter\": 0.3,\n    \"umami\": 0.6\n  },\n  \"structural\": {\n    \"body\": 7.8,\n    \"acidity_perception\": 2.0,\n    \"tannin\": 0.1,\n    \"astringency\": 0.2,\n    \"alcohol_heat\": 0.0,\n    \"effervescence\": 0.0\n  },\n  \"aromatics\": [\n    {\n      \"family\": \"fruit\",\n      \"group\": \"tropical\",\n      \"item\": \"ripe mango\",\n      \"intensity\": 9.4\n    },\n    {\n      \"family\": \"confection\",\n      \"group\": \"candy-like\",\n      \"item\": \"caramelized sweetness\",\n      \"intensity\": 4.6\n    },\n    {\n      \"family\": \"dairy\",\n      \"group\": \"creamy\",\n      \"item\": \"creamy ripe note\",\n      \"intensity\": 2.8\n    },\n    {\n      \"family\": \"floral\",\n      \"group\": \"soft blossom\",\n      \"item\": \"mango blossom\",\n      \"intensity\": 2.1\n    },\n    {\n      \"family\": \"vegetal\",\n      \"group\": \"overripe\",\n      \"item\": \"overripe softness\",\n      \"intensity\": 2.9\n    }\n  ]\n}",
          "compliant": true,
          "score": 0.8,
          "detail": {
            "per_key": {
              "basic_tastes.sweet": {
                "predicted": 9.2,
                "expected": 8.5,
                "match": true
              },
              "basic_tastes.sour": {
                "predicted": 1.8,
                "expected": 3.0,
                "match": true
              },
              "structural.body": {
                "predicted": 7.8,
                "expected": 5.5,
                "match": false
              },
              "structural.acidity_perception": {
                "predicted": 2.0,
                "expected": 3.0,
                "match": true
              },
              "structural.tannin": {
                "predicted": 0.1,
                "expected": 0,
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 712,
            "output_tokens": 343,
            "latency_ms": 2907.3,
            "usd": 0.000173,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"basic_tastes\": {\n    \"sweet\": 9.2,\n    \"sour\": 3.8,\n    \"salty\": 0.2,\n    \"bitter\": 0.4,\n    \"umami\": 1.2\n  },\n  \"structural\": {\n    \"body\": 7.6,\n    \"acidity_perception\": 3.9,\n    \"tannin\": 0.3,\n    \"astringency\": 0.2,\n    \"alcohol_heat\": 0.0,\n    \"effervescence\": 0.0\n  },\n  \"aromatics\": [\n    {\n      \"family\": \"fruit\",\n      \"group\": \"tropical\",\n      \"item\": \"ripe mango\",\n      \"intensity\": 9.3\n    },\n    {\n      \"family\": \"confection\",\n      \"group\": \"candy/ripe sweetness\",\n      \"item\": \"caramelized sweetness\",\n      \"intensity\": 4.2\n    },\n    {\n      \"family\": \"floral\",\n      \"group\": \"aromatic\",\n      \"item\": \"jasmine-like\",\n      \"intensity\": 2.6\n    },\n    {\n      \"family\": \"vegetal\",\n      \"group\": \"soft green\",\n      \"item\": \"mellow green note\",\n      \"intensity\": 1.8\n    },\n    {\n      \"family\": \"herbal\",\n      \"group\": \"fresh leaf\",\n      \"item\": \"light green herb\",\n      \"intensity\": 1.2\n    }\n  ]\n}",
          "compliant": true,
          "score": 0.8,
          "detail": {
            "per_key": {
              "basic_tastes.sweet": {
                "predicted": 9.2,
                "expected": 8.5,
                "match": true
              },
              "basic_tastes.sour": {
                "predicted": 3.8,
                "expected": 3.0,
                "match": true
              },
              "structural.body": {
                "predicted": 7.6,
                "expected": 5.5,
                "match": false
              },
              "structural.acidity_perception": {
                "predicted": 3.9,
                "expected": 3.0,
                "match": true
              },
              "structural.tannin": {
                "predicted": 0.3,
                "expected": 0,
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 712,
            "output_tokens": 344,
            "latency_ms": 2391.7,
            "usd": 0.000173,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"basic_tastes\": {\n    \"sweet\": 9.2,\n    \"sour\": 2.0,\n    \"salty\": 0.3,\n    \"bitter\": 0.4,\n    \"umami\": 1.2\n  },\n  \"structural\": {\n    \"body\": 7.6,\n    \"acidity_perception\": 2.4,\n    \"tannin\": 0.2,\n    \"astringency\": 0.3,\n    \"alcohol_heat\": 0.0,\n    \"effervescence\": 0.0\n  },\n  \"aromatics\": [\n    {\n      \"family\": \"fruit\",\n      \"group\": \"tropical\",\n      \"item\": \"mango\",\n      \"intensity\": 9.3\n    },\n    {\n      \"family\": \"fruit\",\n      \"group\": \"ripe\",\n      \"item\": \"overripe\",\n      \"intensity\": 2.2\n    },\n    {\n      \"family\": \"floral\",\n      \"group\": \"perfume\",\n      \"item\": \"fresh\",\n      \"intensity\": 3.1\n    },\n    {\n      \"family\": \"confection\",\n      \"group\": \"candy-like\",\n      \"item\": \"candied\",\n      \"intensity\": 2.0\n    },\n    {\n      \"family\": \"vegetal\",\n      \"group\": \"green to ripe transition\",\n      \"item\": \"cooked\",\n      \"intensity\": 1.6\n    },\n    {\n      \"family\": \"dairy\",\n      \"group\": \"creamy note\",\n      \"item\": \"fermented\",\n      \"intensity\": 1.1\n    }\n  ]\n}",
          "compliant": true,
          "score": 0.8,
          "detail": {
            "per_key": {
              "basic_tastes.sweet": {
                "predicted": 9.2,
                "expected": 8.5,
                "match": true
              },
              "basic_tastes.sour": {
                "predicted": 2.0,
                "expected": 3.0,
                "match": true
              },
              "structural.body": {
                "predicted": 7.6,
                "expected": 5.5,
                "match": false
              },
              "structural.acidity_perception": {
                "predicted": 2.4,
                "expected": 3.0,
                "match": true
              },
              "structural.tannin": {
                "predicted": 0.2,
                "expected": 0,
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 712,
            "output_tokens": 374,
            "latency_ms": 2766.3,
            "usd": 0.000185,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.8,
      "score_summary": {
        "mean": 0.8,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.8,
        "ci95_hi": 0.8,
        "n": 3
      },
      "score_if_valid": 0.8,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 712.0,
        "output_tokens_mean": 353.667,
        "latency_ms_mean": 2688.433,
        "usd_sum": 0.000531
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-005",
      "category": "structured_output",
      "capability": "schema_valid_dish",
      "difficulty": "expert",
      "cuisine_region": "french",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.dish",
        "vocab.component_roles",
        "vocab.cooking_methods",
        "vocab.courses",
        "vocab.service_temperatures",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "9caf9bc474b2464d",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS dish schema (`schema/dish.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/dish.schema.json\",\n  \"title\": \"OGS Dish\",\n  \"description\": \"A composed dish with components, cooking methods, and overall sensory profile.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"components\",\n    \"overall_sensory_profile\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:dish:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"cuisine\": {\n      \"type\": \"string\",\n      \"description\": \"Free-text cuisine name (v0.1) OR a cuisine entity reference matching ogs:<ns>:cuisine:<id> (v0.2+).\"\n    },\n    \"course\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"amuse\",\n        \"appetizer\",\n        \"soup\",\n        \"salad\",\n        \"fish\",\n        \"main\",\n        \"cheese\",\n        \"dessert\",\n        \"petit_four\"\n      ]\n    },\n    \"service_temperature\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"frozen\",\n        \"cold\",\n        \"cool\",\n        \"room\",\n        \"warm\",\n        \"hot\",\n        \"very_hot\"\n      ]\n    },\n    \"components\": {\n      \"type\": \"array\",\n      \"minItems\": 1,\n      \"items\": {\n        \"$ref\": \"#/$defs/component\"\n      }\n    },\n    \"overall_sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"component\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"role\",\n        \"ingredient_ref\"\n      ],\n      \"properties\": {\n        \"role\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"main_protein\",\n            \"main_vegetable\",\n            \"starch\",\n            \"sauce\",\n            \"fat_component\",\n            \"acid_component\",\n            \"aromatic_garnish\",\n            \"seasoning\",\n            \"textural_element\"\n          ]\n        },\n        \"ingredient_ref\": {\n          \"type\": \"string\",\n          \"pattern\": \"^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$\"\n        },\n        \"cooking_method\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"raw\",\n            \"seared\",\n            \"grilled\",\n            \"roasted\",\n            \"braised\",\n            \"poached\",\n            \"steamed\",\n            \"fried\",\n            \"confit\",\n            \"smoked\",\n            \"fermented\",\n            \"cured\",\n            \"pickled\",\n            \"dehydrated\",\n            \"sous_vide\",\n            \"blanched\",\n            \"sauteed\",\n            \"baked\",\n            \"charred\",\n            \"emulsified\"\n          ]\n        },\n        \"maillard_index\": {\n          \"type\": \"integer\",\n          \"minimum\": 0,\n          \"maximum\": 5\n        },\n        \"reduction_level\": {\n          \"type\": \"integer\",\n          \"minimum\": 0,\n          \"maximum\": 5\n        },\n        \"proportion\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"dominant\",\n            \"significant\",\n            \"accent\",\n            \"trace\"\n          ]\n        },\n        \"sensory_contribution\": {\n          \"type\": \"object\",\n          \"properties\": {\n            \"basic_tastes\": {\n              \"$ref\": \"#/$defs/basic_tastes\"\n            },\n            \"aromatics\": {\n              \"type\": \"array\",\n              \"items\": {\n                \"$ref\": \"#/$defs/aromatic_entry\"\n              }\n            },\n            \"texture\": {\n              \"type\": \"array\",\n              \"items\": {\n                \"type\": \"string\"\n              }\n            }\n          },\n          \"additionalProperties\": true\n        },\n        \"notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Service temperatures\n\nfrozen, cold, cool, room, warm, hot, very_hot\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce an OGS dish JSON document for 'moules marinieres' (mussels steamed with white wine, shallots, parsley, butter). It must validate against the OGS dish schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"dish_slug\": \"moules-marinieres\",\n  \"cuisine\": \"french\",\n  \"course\": \"main\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/dish.schema.json",
        "required_fields": {
          "id": "ogs:core:dish:moules-marinieres",
          "ogs_version": "0.1.0"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:dish:moules-marinieres\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"moules marinieres\",\n  \"description\": \"Steamed mussels served in a white-wine broth with shallots, parsley, and butter.\",\n  \"cuisine\": \"french\",\n  \"course\": \"main\",\n  \"service_temperature\": \"hot\",\n  \"components\": [\n    {\n      \"role\": \"main_protein\",\n      \"ingredient_ref\": \"ogs:core:ingredient:mussels\",\n      \"cooking_method\": \"steamed\",\n      \"proportion\": \"dominant\",\n      \"maillard_index\": 0,\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"umami\": 5.5,\n          \"salty\": 3.5,\n          \"sweet\": 0.5,\n          \"sour\": 0.5,\n          \"bitter\": 0.2\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"marine\",\n            \"group\": \"mussel\",\n            \"item\": \"briny-salt\",\n            \"intensity\": 5.0\n          }\n        ],\n        \"texture\": [\n          \"tender\",\n          \"juicy\"\n        ]\n      }\n    },\n    {\n      \"role\": \"sauce\",\n      \"ingredient_ref\": \"ogs:core:ingredient:white-wine-broth\",\n      \"cooking_method\": \"braised\",\n      \"proportion\": \"significant\",\n      \"maillard_index\": 0,\n      \"reduction_level\": 2,\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"umami\": 3.0,\n          \"salty\": 2.5,\n          \"sour\": 3.5,\n          \"sweet\": 1.5,\n          \"bitter\": 0.8\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"fruit\",\n            \"group\": \"wine\",\n            \"item\": \"citrus-grapey\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 3.5\n          },\n          {\n            \"family\": \"herbal\",\n            \"group\": \"shallot-wine\",\n            \"item\": \"onion-sweet\",\n            \"intensity\": 2.8\n          }\n        ],\n        \"texture\": [\n          \"silky\"\n        ]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:shallots\",\n      \"cooking_method\": \"sauteed\",\n      \"proportion\": \"significant\",\n      \"maillard_index\": 2,\n      \"reduction_level\": 3,\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 3.8,\n          \"sour\": 1.0,\n          \"salty\": 1.5,\n          \"bitter\": 0.8,\n          \"umami\": 1.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"savory\",\n            \"group\": \"allium\",\n            \"item\": \"shallot-sweet\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 4.0\n          }\n        ],\n        \"texture\": [\n          \"soft\"\n        ]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:parsley\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"accent\",\n      \"maillard_index\": 0,\n      \"reduction_level\": 0,\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.2,\n          \"sour\": 0.4,\n          \"salty\": 0.3,\n          \"bitter\": 1.8,\n          \"umami\": 0.6\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"herbal\",\n            \"group\": \"fresh-herbs\",\n            \"item\": \"parsley-green\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 4.2\n          }\n        ],\n        \"texture\": [\n          \"fresh\"\n        ]\n      }\n    },\n    {\n      \"role\": \"fat_component\",\n      \"ingredient_ref\": \"ogs:core:ingredient:butter\",\n      \"cooking_method\": \"emulsified\",\n      \"proportion\": \"significant\",\n      \"maillard_index\": 0,\n      \"reduction_level\": 1,\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.8,\n          \"sour\": 0.2,\n          \"salty\": 2.2,\n          \"bitter\": 0.2,\n          \"umami\": 1.2\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"dairy\",\n            \"group\": \"buttery\",\n            \"item\": \"creamy-dairy\",\n            \"intensity\": 3.8\n          }\n        ],\n        \"texture\": [\n          \"emulsified-silky\"\n        ]\n      }\n    }\n  ],\n  \"overall_sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 1.2,\n      \"sour\": 3.2,\n      \"salty\": 3.4,\n      \"bitter\": 1.0,\n      \"umami\": 4.6\n    },\n    \"structural\": {\n      \"body\": 6.0,\n      \"acidity_perception\": 3.5,\n      \"tannin\": 0.2,\n      \"astringency\": 0.2,\n      \"alcohol_heat\": 1.0,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"marine\",\n        \"group\": \"sea\",\n        \"item\": \"briny-salt\",\n        \"intensity\": 4.8\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"wine\",\n        \"item\": \"citrus\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 3.2\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"fresh-herbs\",\n        \"item\": \"parsley-green\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 4.0\n      },\n      {\n        \"family\": \"dairy\",\n        \"group\": \"buttery\",\n        \"item\": \"creamy\",\n        \"intensity\": 3.6\n      }\n    ],\n    \"texture\": [\n      \"tender\",\n      \"silky broth\",\n      \"juicy\"\n    ],\n    \"finish_length\": 4.5,\n    \"complexity\": 5.8\n  },\n  \"metadata\": {\n    \"source\": \"user-provided\",\n    \"license\": \"unknown\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:dish:moules-marinieres",
                "expected": "ogs:core:dish:moules-marinieres",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3079,
            "output_tokens": 1527,
            "latency_ms": 9972.8,
            "usd": 0.000765,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:benchmark-task:dish:moules-marinieres\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"fr\",\n  \"available_languages\": [\"fr\", \"en\"],\n  \"name\": {\n    \"primary_language\": \"fr\",\n    \"labels\": [\n      {\n        \"value\": \"Moules marinières\",\n        \"language\": \"fr\",\n        \"role\": \"preferred\"\n      },\n      {\n        \"value\": \"Marinière mussels\",\n        \"language\": \"en\",\n        \"role\": \"alternate\"\n      }\n    ]\n  },\n  \"description\": {\n    \"primary_language\": \"en\",\n    \"labels\": [\n      {\n        \"value\": \"Steamed mussels in a white-wine, shallot, and parsley sauce enriched with butter.\",\n        \"language\": \"en\",\n        \"role\": \"preferred\"\n      },\n      {\n        \"value\": \"Moules cuites à la vapeur dans une sauce au vin blanc, aux échalotes et au persil, enrichie de beurre.\",\n        \"language\": \"fr\",\n        \"role\": \"alternate\"\n      }\n    ]\n  },\n  \"cuisine\": \"french\",\n  \"course\": \"main\",\n  \"service_temperature\": \"hot\",\n  \"components\": [\n    {\n      \"role\": \"main_protein\",\n      \"ingredient_ref\": \"ogs:core:ingredient:mussels\",\n      \"cooking_method\": \"steamed\",\n      \"proportion\": \"dominant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"umami\": 7.5,\n          \"salty\": 2.5,\n          \"sweet\": 0.5,\n          \"sour\": 0.5,\n          \"bitter\": 0.5\n        },\n        \"texture\": [\"tender\", \"juicy\"]\n      }\n    },\n    {\n      \"role\": \"sauce\",\n      \"ingredient_ref\": \"ogs:core:ingredient:white-wine\",\n      \"cooking_method\": \"braised\",\n      \"proportion\": \"significant\",\n      \"reduction_level\": 2,\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sour\": 2.5,\n          \"salty\": 1.0,\n          \"umami\": 1.5,\n          \"sweet\": 1.0,\n          \"bitter\": 0.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"fruit\",\n            \"group\": \"white wine fruit\",\n            \"item\": \"citrus/stone-fruit notes\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 3.0\n          },\n          {\n            \"family\": \"savory\",\n            \"group\": \"wine-derived savoriness\",\n            \"item\": \"rounded mineral/wine character\",\n            \"intensity\": 2.5\n          }\n        ]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:shallots\",\n      \"cooking_method\": \"sauteed\",\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 2.0,\n          \"sour\": 0.5,\n          \"salty\": 0.5,\n          \"bitter\": 0.8,\n          \"umami\": 1.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"vegetal\",\n            \"group\": \"allium\",\n            \"item\": \"shallot aroma\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 4.0\n          }\n        ],\n        \"texture\": [\"soft\"]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:parsley\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.5,\n          \"sour\": 0.5,\n          \"salty\": 0.2,\n          \"bitter\": 1.0,\n          \"umami\": 0.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"herbal\",\n            \"group\": \"green herb\",\n            \"item\": \"parsley\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 5.0\n          }\n        ],\n        \"texture\": [\"fresh\"]\n      }\n    },\n    {\n      \"role\": \"fat_component\",\n      \"ingredient_ref\": \"ogs:core:ingredient:butter\",\n      \"cooking_method\": \"emulsified\",\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.5,\n          \"sour\": 0.2,\n          \"salty\": 0.8,\n          \"bitter\": 0.2,\n          \"umami\": 0.8\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"dairy\",\n            \"group\": \"butter\",\n            \"item\": \"buttery richness\",\n            \"intensity\": 4.5\n          }\n        ],\n        \"texture\": [\"silky\"]\n      }\n    },\n    {\n      \"role\": \"seasoning\",\n      \"ingredient_ref\": \"ogs:core:ingredient:sea-salt\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"trace\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"salty\": 3.5,\n          \"sweet\": 0.0,\n          \"sour\": 0.0,\n          \"bitter\": 0.0,\n          \"umami\": 0.5\n        }\n      }\n    }\n  ],\n  \"overall_sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 1.0,\n      \"sour\": 2.0,\n      \"salty\": 3.5,\n      \"bitter\": 1.0,\n      \"umami\": 6.0\n    },\n    \"structural\": {\n      \"body\": 6.0,\n      \"acidity_perception\": 2.2,\n      \"tannin\": 0.2,\n      \"astringency\": 0.2,\n      \"alcohol_heat\": 1.0,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"savory\",\n        \"group\": \"sea + broth\",\n        \"item\": \"briny seafood character\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"green herb\",\n        \"item\": \"parsley\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 4.8\n      },\n      {\n        \"family\": \"vegetal\",\n        \"group\": \"allium\",\n        \"item\": \"shallot\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 3.8\n      },\n      {\n        \"family\": \"dairy\",\n        \"group\": \"butter\",\n        \"item\": \"buttery richness\",\n        \"intensity\": 4.0\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"white wine fruit notes\",\n        \"item\": \"citrus/stone-fruit\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 2.8\n      }\n    ],\n    \"texture\": [\"tender mussels\", \"silky wine-butter broth\"],\n    \"finish_length\": 5.5,\n    \"complexity\": 5.8\n  },\n  \"notes\": {\n    \"primary_language\": \"en\",\n    \"labels\": [\n      {\n        \"value\": \"Classic marinière: mussels steamed and served with their aromatic broth, finished with butter and parsley.\",\n        \"language\": \"en\",\n        \"role\": \"preferred\"\n      }\n    ]\n  },\n  \"metadata\": {\n    \"source\": \"user-provided\",\n    \"license\": \"unspecified\"\n  }\n}",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": "ogs:benchmark-task:dish:moules-marinieres",
                "expected": "ogs:core:dish:moules-marinieres",
                "match": false
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3079,
            "output_tokens": 1783,
            "latency_ms": 10377.6,
            "usd": 0.000867,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:dish:moules-marinieres\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": \"moules marinieres\",\n  \"cuisine\": \"french\",\n  \"course\": \"main\",\n  \"service_temperature\": \"hot\",\n  \"components\": [\n    {\n      \"role\": \"main_protein\",\n      \"ingredient_ref\": \"ogs:core:ingredient:mussels\",\n      \"cooking_method\": \"steamed\",\n      \"proportion\": \"dominant\",\n      \"maillard_index\": 0,\n      \"reduction_level\": 0,\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.4,\n          \"sour\": 0.2,\n          \"salty\": 5.8,\n          \"bitter\": 0.1,\n          \"umami\": 7.2\n        },\n        \"texture\": [\n          \"tender\",\n          \"juicy\"\n        ],\n        \"aromatics\": [\n          {\n            \"family\": \"marine\",\n            \"group\": \"sea-breeze\",\n            \"item\": \"briny mussel aroma\",\n            \"intensity\": 6.4\n          }\n        ]\n      }\n    },\n    {\n      \"role\": \"sauce\",\n      \"ingredient_ref\": \"ogs:core:ingredient/white-wine-sauce-base\",\n      \"cooking_method\": \"emulsified\",\n      \"proportion\": \"significant\",\n      \"maillard_index\": 0,\n      \"reduction_level\": 2,\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.8,\n          \"sour\": 1.8,\n          \"salty\": 3.2,\n          \"bitter\": 0.2,\n          \"umami\": 2.0\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"earth\",\n            \"group\": \"shallot-derived\",\n            \"item\": \"soft allium sweetness\",\n            \"intensity\": 3.2\n          },\n          {\n            \"family\": \"savory\",\n            \"group\": \"wine-cooked\",\n            \"item\": \"light vinous aroma\",\n            \"intensity\": 3.6\n          }\n        ],\n        \"texture\": [\n          \"silky\",\n          \"lightly glossy\"\n        ]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient/shallots\",\n      \"cooking_method\": \"sauteed\",\n      \"proportion\": \"significant\",\n      \"maillard_index\": 2,\n      \"reduction_level\": 1,\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 1.6,\n          \"sour\": 0.3,\n          \"salty\": 0.6,\n          \"bitter\": 0.1,\n          \"umami\": 0.9\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"earth\",\n            \"group\": \"allium\",\n            \"item\": \"cooked shallot\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 4.8\n          }\n        ],\n        \"texture\": [\n          \"soft\"\n        ]\n      }\n    },\n    {\n      \"role\": \"fat_component\",\n      \"ingredient_ref\": \"ogs:core:ingredient/butter\",\n      \"cooking_method\": \"emulsified\",\n      \"proportion\": \"accent\",\n      \"maillard_index\": 0,\n      \"reduction_level\": 0,\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.6,\n          \"sour\": 0.1,\n          \"salty\": 0.8,\n          \"bitter\": 0.1,\n          \"umami\": 0.6\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"dairy\",\n            \"group\": \"butter\",\n            \"item\": \"creamy butter note\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 3.8\n          }\n        ],\n        \"texture\": [\n          \"creamy\",\n          \"coating\"\n        ]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient/parsley\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"accent\",\n      \"maillard_index\": 0,\n      \"reduction_level\": 0,\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.2,\n          \"sour\": 0.2,\n          \"salty\": 0.1,\n          \"bitter\": 0.7,\n          \"umami\": 0.2\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"herbal\",\n            \"group\": \"green\",\n            \"item\": \"fresh parsley\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 4.4\n          }\n        ],\n        \"texture\": [\n          \"fresh-herb pop\"\n        ]\n      }\n    },\n    {\n      \"role\": \"seasoning\",\n      \"ingredient_ref\": \"ogs:core:ingredient/fine-sea-salt\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"accent\",\n      \"maillard_index\": 0,\n      \"reduction_level\": 0,\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.0,\n          \"sour\": 0.0,\n          \"salty\": 3.6,\n          \"bitter\": 0.0,\n          \"umami\": 0.0\n        }\n      }\n    }\n  ],\n  \"overall_sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 0.9,\n      \"sour\": 1.9,\n      \"salty\": 6.0,\n      \"bitter\": 0.7,\n      \"umami\": 7.0\n    },\n    \"structural\": {\n      \"body\": 6.0,\n      \"acidity_perception\": 2.2,\n      \"tannin\": 0.2,\n      \"astringency\": 0.3,\n      \"alcohol_heat\": 0.4,\n      \"effervescence\": 0.2\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"marine\",\n        \"group\": \"sea\",\n        \"item\": \"briny mussel\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 6.2\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"allium\",\n        \"item\": \"soft shallot sweetness\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 4.6\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"green\",\n        \"item\": \"parsley freshness\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"dairy\",\n        \"group\": \"butter\",\n        \"item\": \"creamy butter lift\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 3.5\n      },\n      {\n        \"family\": \"savory\",\n        \"group\": \"wine\",\n        \"item\": \"light vinous savor\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 3.6\n      }\n    ],\n    \"texture\": [\n      \"tender shellfish\",\n      \"juicy\",\n      \"lightly glossy broth\"\n    ],\n    \"finish_length\": 4.8,\n    \"complexity\": 5.8\n  }\n}",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "'ogs:core:ingredient/fine-sea-salt' does not match '^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$'"
            ],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:dish:moules-marinieres",
                "expected": "ogs:core:dish:moules-marinieres",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3079,
            "output_tokens": 1706,
            "latency_ms": 10143.6,
            "usd": 0.000836,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.5,
      "score_summary": {
        "mean": 0.5,
        "stdev": 0.5,
        "sem": 0.2887,
        "ci95_lo": -0.0658,
        "ci95_hi": 1.0658,
        "n": 3
      },
      "score_if_valid": 0.5,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 3079.0,
        "output_tokens_mean": 1672.0,
        "latency_ms_mean": 10164.667,
        "usd_sum": 0.002468
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-006",
      "category": "structured_output",
      "capability": "schema_valid_ingredient",
      "difficulty": "medium",
      "cuisine_region": "iranian",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.ingredient",
        "vocab.ingredient_categories",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "bb4d0fcee2d1e751",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS ingredient schema (`schema/ingredient.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/ingredient.schema.json\",\n  \"title\": \"OGS Ingredient\",\n  \"description\": \"An ingredient entity with sensory profile, category, and provenance metadata.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"category\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$\",\n      \"description\": \"OGS identifier in the format ogs:<namespace>:ingredient:<id>\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\",\n      \"description\": \"OGS specification version (0.1.x or 0.2.x)\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\",\n      \"description\": \"Authoritative / endonym language of this entity's human-readable fields.\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      },\n      \"description\": \"Advisory list of languages covered by this entity's LString fields.\"\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"category\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"protein\",\n        \"vegetable\",\n        \"fruit\",\n        \"grain\",\n        \"dairy\",\n        \"fat\",\n        \"herb\",\n        \"spice\",\n        \"fungus\",\n        \"legume\",\n        \"nut\",\n        \"seed\",\n        \"sweetener\",\n        \"acid\",\n        \"salt\",\n        \"other\"\n      ]\n    },\n    \"subcategory\": {\n      \"type\": \"string\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"origin\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"region\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"season\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"terroir_notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Ingredient categories\n\nprotein, vegetable, fruit, grain, dairy, fat, herb, spice, fungus, legume, nut, seed, sweetener, acid, salt, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document that describes 'saffron' as an OGS ingredient. It MUST validate against the OGS ingredient schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"ingredient_name\": \"saffron\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"saffron-stigma\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/ingredient.schema.json",
        "required_fields": {
          "id": "ogs:core:ingredient:saffron-stigma",
          "ogs_version": "0.1.0",
          "category": "spice"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:saffron-stigma\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": {\n    \"labels\": [\n      {\n        \"value\": \"saffron\",\n        \"language\": \"en\",\n        \"role\": \"preferred\"\n      }\n    ]\n  },\n  \"category\": \"spice\",\n  \"subcategory\": \"stigma (Crocus sativus)\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 1.0,\n      \"sour\": 0.5,\n      \"salty\": 0.2,\n      \"bitter\": 1.8,\n      \"umami\": 0.8\n    },\n    \"structural\": {\n      \"body\": 1.0,\n      \"acidity_perception\": 0.6,\n      \"tannin\": 0.4,\n      \"astringency\": 0.3,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"spice\",\n        \"group\": \"saffron-like\",\n        \"item\": \"honeyed\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 6.8\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"flavor base\",\n        \"item\": \"toasty-sweet\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 4.6\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"petal notes\",\n        \"item\": \"dry floral\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 3.8\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"background\",\n        \"item\": \"hay-like\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 3.2\n      }\n    ],\n    \"texture\": [\n      \"threads\",\n      \"slightly resinous\"\n    ],\n    \"finish_length\": 5.0,\n    \"complexity\": 6.0\n  },\n  \"notes\": {\n    \"labels\": [\n      {\n        \"value\": \"Dried stigma threads; aroma releases strongly with gentle heat and hydration.\",\n        \"language\": \"en\",\n        \"role\": \"descriptive\"\n      }\n    ]\n  },\n  \"metadata\": {\n    \"source\": \"structured description for OGS ingredient entity\",\n    \"license\": \"unknown\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:saffron-stigma",
                "expected": "ogs:core:ingredient:saffron-stigma",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "spice",
                "expected": "spice",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2501,
            "output_tokens": 559,
            "latency_ms": 4080.0,
            "usd": 0.000349,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:saffron-stigma\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": {\n    \"labels\": [\n      {\n        \"value\": \"saffron\",\n        \"language\": \"en\",\n        \"role\": \"preferred\"\n      }\n    ]\n  },\n  \"category\": \"spice\",\n  \"subcategory\": \"stigma (saffron threads)\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 1.0,\n      \"sour\": 0.3,\n      \"salty\": 0.2,\n      \"bitter\": 2.8,\n      \"umami\": 0.8\n    },\n    \"structural\": {\n      \"body\": 1.5,\n      \"acidity_perception\": 0.4,\n      \"tannin\": 0.2,\n      \"astringency\": 0.6,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"floral\",\n        \"group\": \"honeyed\",\n        \"item\": \"stigma floral\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 5.8\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"warm spice\",\n        \"item\": \"hay-like spice\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 4.9\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"golden sweetness\",\n        \"item\": \"honey-caramel\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 2.6\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"stigma depth\",\n        \"item\": \"earthy-balsamic\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 2.8\n      }\n    ],\n    \"texture\": [\n      \"delicate threads\",\n      \"fine powder when ground\"\n    ],\n    \"finish_length\": 6.0,\n    \"complexity\": 6.5\n  },\n  \"origin\": {\n    \"region\": \"Mediterranean / Southwest Asia (commonly cultivated in Iran, Spain, and Greece)\",\n    \"season\": \"Autumn (harvest period)\"\n  },\n  \"notes\": {\n    \"labels\": [\n      {\n        \"value\": \"Aromatic coloring spice derived from the dried stigmas of Crocus sativus; typically used in small amounts for aroma and color.\",\n        \"language\": \"en\",\n        \"role\": \"descriptive\"\n      }\n    ]\n  },\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"updated\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"OGS ingredient profile (sensory approximation)\",\n    \"license\": \"unknown\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:saffron-stigma",
                "expected": "ogs:core:ingredient:saffron-stigma",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "spice",
                "expected": "spice",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2501,
            "output_tokens": 658,
            "latency_ms": 4888.4,
            "usd": 0.000388,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:saffron-stigma\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": {\n    \"en\": \"saffron\",\n    \"fr\": \"safran\"\n  },\n  \"category\": \"spice\",\n  \"subcategory\": \"stigma\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 0.8,\n      \"sour\": 0.2,\n      \"salty\": 0.1,\n      \"bitter\": 1.6,\n      \"umami\": 0.4\n    },\n    \"structural\": {\n      \"body\": 1.3,\n      \"acidity_perception\": 0.4,\n      \"tannin\": 0.6,\n      \"astringency\": 0.7,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"spice\",\n        \"group\": \"saffron tones\",\n        \"item\": \"hay-like\",\n        \"intensity\": 5.2,\n        \"qualifier\": \"dried\"\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"floral nuance\",\n        \"item\": \"honeyed floral\",\n        \"intensity\": 3.8,\n        \"qualifier\": \"cooked\"\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"sweet spice\",\n        \"item\": \"caramelized honey\",\n        \"intensity\": 2.6,\n        \"qualifier\": \"candied\"\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"earthy base\",\n        \"item\": \"strawy\",\n        \"intensity\": 2.3,\n        \"qualifier\": \"dried\"\n      }\n    ],\n    \"texture\": [\n      \"threads\",\n      \"powdery when ground\"\n    ],\n    \"finish_length\": 4.2,\n    \"complexity\": 6.1\n  },\n  \"origin\": {\n    \"region\": \"Southwest Asia (commonly Iran, Afghanistan, Spain, Greece)\",\n    \"season\": \"Harvest typically in autumn\",\n    \"terroir_notes\": \"Flavor intensity varies with cultivar and drying method\"\n  },\n  \"notes\": {\n    \"en\": \"Saffron is valued for its intense aroma and characteristic golden color. Used as threads or ground.\",\n    \"es\": \"El azafrán se valora por su aroma intenso y su característico color dorado. Se usa en hebras o molido.\"\n  },\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"updated\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"knowledge-based profile\",\n    \"license\": \"unknown\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:saffron-stigma",
                "expected": "ogs:core:ingredient:saffron-stigma",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "spice",
                "expected": "spice",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2501,
            "output_tokens": 640,
            "latency_ms": 4662.0,
            "usd": 0.000381,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 2501.0,
        "output_tokens_mean": 619.0,
        "latency_ms_mean": 4543.467,
        "usd_sum": 0.001118
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-007",
      "category": "structured_output",
      "capability": "schema_valid_ingredient",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.ingredient",
        "vocab.ingredient_categories",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "0bc2e49ee6c1093e",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS ingredient schema (`schema/ingredient.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/ingredient.schema.json\",\n  \"title\": \"OGS Ingredient\",\n  \"description\": \"An ingredient entity with sensory profile, category, and provenance metadata.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"category\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$\",\n      \"description\": \"OGS identifier in the format ogs:<namespace>:ingredient:<id>\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\",\n      \"description\": \"OGS specification version (0.1.x or 0.2.x)\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\",\n      \"description\": \"Authoritative / endonym language of this entity's human-readable fields.\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      },\n      \"description\": \"Advisory list of languages covered by this entity's LString fields.\"\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"category\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"protein\",\n        \"vegetable\",\n        \"fruit\",\n        \"grain\",\n        \"dairy\",\n        \"fat\",\n        \"herb\",\n        \"spice\",\n        \"fungus\",\n        \"legume\",\n        \"nut\",\n        \"seed\",\n        \"sweetener\",\n        \"acid\",\n        \"salt\",\n        \"other\"\n      ]\n    },\n    \"subcategory\": {\n      \"type\": \"string\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"origin\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"region\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"season\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"terroir_notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Ingredient categories\n\nprotein, vegetable, fruit, grain, dairy, fat, herb, spice, fungus, legume, nut, seed, sweetener, acid, salt, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document that describes 'miso (red)' as an OGS ingredient. It MUST validate against the OGS ingredient schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"ingredient_name\": \"miso (red)\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"aka-miso\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/ingredient.schema.json",
        "required_fields": {
          "id": "ogs:core:ingredient:aka-miso",
          "ogs_version": "0.1.0",
          "category": "other"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:aka-miso\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": {\n    \"en\": \"miso (red)\"\n  },\n  \"category\": \"protein\",\n  \"subcategory\": \"fermented soybean paste\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 2.0,\n      \"sour\": 1.0,\n      \"salty\": 7.5,\n      \"bitter\": 1.5,\n      \"umami\": 8.5\n    },\n    \"structural\": {\n      \"body\": 4.5,\n      \"acidity_perception\": 1.5,\n      \"tannin\": 0.0,\n      \"astringency\": 0.5,\n      \"alcohol_heat\": 0.0,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"savory\",\n        \"group\": \"fermented\",\n        \"item\": \"soy sauce-like aroma\",\n        \"intensity\": 7.0,\n        \"qualifier\": \"fermented\"\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"fermented\",\n        \"item\": \"mushroomy/koji depth\",\n        \"intensity\": 6.5,\n        \"qualifier\": \"fermented\"\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"aged\",\n        \"item\": \"toasty/aged notes\",\n        \"intensity\": 4.5,\n        \"qualifier\": \"cooked\"\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"caramel-like\",\n        \"item\": \"deep malty sweetness\",\n        \"intensity\": 3.5,\n        \"qualifier\": \"cooked\"\n      }\n    ],\n    \"texture\": [\n      \"thick\",\n      \"paste-like\"\n    ],\n    \"finish_length\": 4.0,\n    \"complexity\": 6.5\n  },\n  \"origin\": {\n    \"region\": \"Japan\",\n    \"season\": \"year-round\"\n  },\n  \"notes\": {\n    \"en\": \"Red miso is a fermented soybean paste (often with barley or rice), typically darker and more robust than white miso, with higher perceived sweetness and umami alongside noticeable salt.\"\n  },\n  \"metadata\": {\n    \"source\": \"general sensory characterization of red miso\",\n    \"license\": \"unspecified\",\n    \"updated\": \"2026-04-18T00:00:00Z\"\n  }\n}",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:aka-miso",
                "expected": "ogs:core:ingredient:aka-miso",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "protein",
                "expected": "other",
                "match": false
              }
            }
          },
          "metrics": {
            "input_tokens": 2501,
            "output_tokens": 575,
            "latency_ms": 4191.7,
            "usd": 0.000355,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:aka-miso\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"miso (red)\",\n  \"category\": \"protein\",\n  \"subcategory\": \"fermented soybean paste\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 2.8,\n      \"sour\": 1.2,\n      \"salty\": 7.8,\n      \"bitter\": 1.0,\n      \"umami\": 8.6\n    },\n    \"structural\": {\n      \"body\": 6.2,\n      \"acidity_perception\": 1.3,\n      \"tannin\": 0.4,\n      \"astringency\": 0.6,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"savory\",\n        \"group\": \"fermented\",\n        \"item\": \"soy umami\",\n        \"qualifier\": \"fermented\",\n        \"intensity\": 8.2\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"fermented\",\n        \"item\": \"mellow earthy notes\",\n        \"intensity\": 4.6\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"toasty\",\n        \"item\": \"roasted/toasty aroma\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 3.9\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"caramelized\",\n        \"item\": \"light sweetness from malt\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 2.6\n      }\n    ],\n    \"texture\": [\n      \"thick paste\",\n      \"smooth\"\n    ],\n    \"finish_length\": 4.2,\n    \"complexity\": 6.7\n  },\n  \"notes\": \"Red miso is a fermented soybean paste with pronounced umami and salty savoriness, typically with warm, toasty, and earthy aromatic notes.\",\n  \"metadata\": {\n    \"source\": \"OGS task generation\",\n    \"license\": \"unspecified\"\n  }\n}",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:aka-miso",
                "expected": "ogs:core:ingredient:aka-miso",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "protein",
                "expected": "other",
                "match": false
              }
            }
          },
          "metrics": {
            "input_tokens": 2501,
            "output_tokens": 486,
            "latency_ms": 3502.6,
            "usd": 0.000319,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:aka-miso\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": {\n    \"en\": \"miso (red)\"\n  },\n  \"description\": {\n    \"en\": \"A fermented Japanese soybean paste with a dark reddish-brown color, typically aged longer and with a savory, umami-forward flavor profile.\"\n  },\n  \"category\": \"protein\",\n  \"subcategory\": \"fermented soybean paste\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 1.5,\n      \"sour\": 1.2,\n      \"salty\": 7.5,\n      \"bitter\": 1.8,\n      \"umami\": 8.8\n    },\n    \"structural\": {\n      \"body\": 6.3,\n      \"acidity_perception\": 2.0,\n      \"tannin\": 0.6,\n      \"astringency\": 0.8,\n      \"alcohol_heat\": 0.0,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"savory\",\n        \"group\": \"fermented\",\n        \"item\": \"soy/umami notes\",\n        \"qualifier\": \"fermented\",\n        \"intensity\": 8.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"toasty\",\n        \"item\": \"aged, roasted aroma\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 5.5\n      }\n    ],\n    \"texture\": [\n      \"thick\",\n      \"paste-like\"\n    ],\n    \"finish_length\": 5.0,\n    \"complexity\": 6.7\n  },\n  \"origin\": {\n    \"region\": {\n      \"en\": \"Japan\"\n    },\n    \"terroir_notes\": {\n      \"en\": \"Flavor intensity varies with aging time and koji composition.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Commonly used for miso soup and marinades; red miso is generally more robust and less sweet than lighter miso styles.\"\n  },\n  \"metadata\": {\n    \"source\": \"general culinary knowledge\",\n    \"license\": \"unspecified\"\n  }\n}",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:aka-miso",
                "expected": "ogs:core:ingredient:aka-miso",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "protein",
                "expected": "other",
                "match": false
              }
            }
          },
          "metrics": {
            "input_tokens": 2501,
            "output_tokens": 505,
            "latency_ms": 3541.6,
            "usd": 0.000327,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.5,
      "score_summary": {
        "mean": 0.5,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.5,
        "ci95_hi": 0.5,
        "n": 3
      },
      "score_if_valid": 0.5,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 2501.0,
        "output_tokens_mean": 522.0,
        "latency_ms_mean": 3745.3,
        "usd_sum": 0.001001
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-008",
      "category": "structured_output",
      "capability": "schema_valid_ingredient",
      "difficulty": "hard",
      "cuisine_region": "turkish",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.ingredient",
        "vocab.ingredient_categories",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "791ac1beebd95233",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS ingredient schema (`schema/ingredient.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/ingredient.schema.json\",\n  \"title\": \"OGS Ingredient\",\n  \"description\": \"An ingredient entity with sensory profile, category, and provenance metadata.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"category\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$\",\n      \"description\": \"OGS identifier in the format ogs:<namespace>:ingredient:<id>\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\",\n      \"description\": \"OGS specification version (0.1.x or 0.2.x)\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\",\n      \"description\": \"Authoritative / endonym language of this entity's human-readable fields.\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      },\n      \"description\": \"Advisory list of languages covered by this entity's LString fields.\"\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"category\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"protein\",\n        \"vegetable\",\n        \"fruit\",\n        \"grain\",\n        \"dairy\",\n        \"fat\",\n        \"herb\",\n        \"spice\",\n        \"fungus\",\n        \"legume\",\n        \"nut\",\n        \"seed\",\n        \"sweetener\",\n        \"acid\",\n        \"salt\",\n        \"other\"\n      ]\n    },\n    \"subcategory\": {\n      \"type\": \"string\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"origin\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"region\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"season\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"terroir_notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Ingredient categories\n\nprotein, vegetable, fruit, grain, dairy, fat, herb, spice, fungus, legume, nut, seed, sweetener, acid, salt, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document that describes 'Urfa chili' as an OGS ingredient. It MUST validate against the OGS ingredient schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"ingredient_name\": \"Urfa chili\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"urfa-biber\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/ingredient.schema.json",
        "required_fields": {
          "id": "ogs:core:ingredient:urfa-biber",
          "ogs_version": "0.1.0",
          "category": "spice"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:urfa-biber\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": {\n    \"en\": \"Urfa chili\"\n  },\n  \"category\": \"spice\",\n  \"subcategory\": \"chili pepper (dried)\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 1.5,\n      \"sour\": 0.2,\n      \"salty\": 0,\n      \"bitter\": 2.2,\n      \"umami\": 0.8\n    },\n    \"structural\": {\n      \"body\": 2.5,\n      \"acidity_perception\": 0.3,\n      \"tannin\": 0.6,\n      \"astringency\": 1.2,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"spice\",\n        \"group\": \"pepper notes\",\n        \"item\": \"smoky, roasted chili\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"terroir-like notes\",\n        \"item\": \"cooked soil/umbral warmth\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 3.8\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"plant-derived\",\n        \"item\": \"dried herb edge\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 2.7\n      }\n    ],\n    \"texture\": [\n      \"fine ground\",\n      \"powdery\"\n    ],\n    \"finish_length\": 3.5,\n    \"complexity\": 4.6\n  },\n  \"origin\": {\n    \"region\": \"Urfa (Şanlıurfa), southeastern Türkiye\",\n    \"season\": \"late summer to early autumn\"\n  },\n  \"notes\": \"Urfa chili (biber) is a dried, mild-to-medium hot chili commonly used for its smoky-roasted aroma and deep savory warmth.\",\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"updated\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"general culinary knowledge\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:urfa-biber",
                "expected": "ogs:core:ingredient:urfa-biber",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "spice",
                "expected": "spice",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2500,
            "output_tokens": 535,
            "latency_ms": 3738.0,
            "usd": 0.000339,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:urfa-biber\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\n    \"en\",\n    \"tr\"\n  ],\n  \"name\": {\n    \"labels\": [\n      {\n        \"value\": \"Urfa chili\",\n        \"language\": \"en\",\n        \"role\": \"preferred\"\n      },\n      {\n        \"value\": \"Urfa biber\",\n        \"language\": \"tr\",\n        \"role\": \"preferred\"\n      }\n    ]\n  },\n  \"description\": {\n    \"labels\": [\n      {\n        \"value\": \"Smoked, dried Turkish chili variety (often labeled Urfa biber) used for its fruity, smoky heat.\",\n        \"language\": \"en\",\n        \"role\": \"descriptive\"\n      },\n      {\n        \"value\": \"Dumanlı ve kurutulmuş Türk biber çeşidi (genellikle Urfa biber olarak bilinir); meyvemsi ve dumanlı bir sıcaklık verir.\",\n        \"language\": \"tr\",\n        \"role\": \"descriptive\"\n      }\n    ]\n  },\n  \"category\": \"spice\",\n  \"subcategory\": \"chili pepper\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 1.5,\n      \"sour\": 0.2,\n      \"salty\": 0,\n      \"bitter\": 1.0,\n      \"umami\": 0.5\n    },\n    \"structural\": {\n      \"body\": 2.5,\n      \"acidity_perception\": 0.3,\n      \"tannin\": 0.4,\n      \"astringency\": 0.7,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"chili\",\n        \"item\": \"dried pepper fruitiness\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"chili\",\n        \"item\": \"warming spice notes\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"smoke\",\n        \"item\": \"smoky char\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"pepper\",\n        \"item\": \"earthy dried pod\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 4.0\n      }\n    ],\n    \"texture\": [\n      \"dried\",\n      \"fine ground\"\n    ],\n    \"finish_length\": 4.5,\n    \"complexity\": 6.0\n  },\n  \"origin\": {\n    \"region\": {\n      \"labels\": [\n        {\n          \"value\": \"Şanlıurfa, Turkey\",\n          \"language\": \"en\",\n          \"role\": \"preferred\"\n        },\n        {\n          \"value\": \"Türkiye, Şanlıurfa\",\n          \"language\": \"tr\",\n          \"role\": \"preferred\"\n        }\n      ]\n    },\n    \"season\": {\n      \"labels\": [\n        {\n          \"value\": \"late summer to early autumn (typically dried/preserved thereafter)\",\n          \"language\": \"en\",\n          \"role\": \"descriptive\"\n        }\n      ]\n    }\n  },\n  \"notes\": {\n    \"labels\": [\n      {\n        \"value\": \"Often used as a smoky, fruity chili seasoning; pungency level varies by preparation and grind.\",\n        \"language\": \"en\",\n        \"role\": \"descriptive\"\n      }\n    ]\n  },\n  \"metadata\": {\n    \"source\": \"knowledge-based sensory description\",\n    \"license\": \"unspecified\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:urfa-biber",
                "expected": "ogs:core:ingredient:urfa-biber",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "spice",
                "expected": "spice",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2500,
            "output_tokens": 843,
            "latency_ms": 5619.3,
            "usd": 0.000462,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:urfa-biber\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\n    \"en\",\n    \"tr\"\n  ],\n  \"name\": {\n    \"en\": \"Urfa chili\",\n    \"tr\": \"Urfa biberi\"\n  },\n  \"description\": {\n    \"en\": \"A sun-dried Turkish chili pepper known for its smoky, fruity, and gently spicy flavor profile.\",\n    \"tr\": \"Güneşte kurutulmuş, dumanlı ve meyvemsi aromalara sahip bir Türk biberi.\"\n  },\n  \"category\": \"spice\",\n  \"subcategory\": \"chili pepper (dried)\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 1.5,\n      \"sour\": 0.2,\n      \"salty\": 0.0,\n      \"bitter\": 2.2,\n      \"umami\": 0.3\n    },\n    \"structural\": {\n      \"body\": 1.8,\n      \"acidity_perception\": 0.3,\n      \"tannin\": 0.8,\n      \"astringency\": 1.0,\n      \"alcohol_heat\": 0.0,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"spice\",\n        \"group\": \"chili\",\n        \"item\": \"smoky\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"dried pepper\",\n        \"item\": \"raisin-like / fruity\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 4.8\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"chili\",\n        \"item\": \"earthy\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 3.2\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"chili\",\n        \"item\": \"herbal green note\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 1.6\n      }\n    ],\n    \"texture\": [\n      \"powdery\",\n      \"dry\"\n    ],\n    \"finish_length\": 5.0,\n    \"complexity\": 6.2\n  },\n  \"origin\": {\n    \"region\": {\n      \"en\": \"Şanlıurfa, Turkey\",\n      \"tr\": \"Şanlıurfa, Türkiye\"\n    },\n    \"season\": {\n      \"en\": \"summer to early autumn harvest (then dried)\",\n      \"tr\": \"yazdan erken sonbahara hasat (sonra kurutma)\"\n    },\n    \"terroir_notes\": {\n      \"en\": \"Dried in warm, dry conditions that amplify smoky and fruit-like notes.\",\n      \"tr\": \"Sıcak ve kuru koşullarda kurutma; dumanlı ve meyvemsi notaları belirginleştirir.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Often used in Turkish cuisine as ground dried pepper for kebabs, stews, and grilled dishes; heat is typically moderate with a rounded flavor.\",\n    \"tr\": \"Türk mutfağında genellikle öğütülerek kebap, güveç ve ızgara yemeklerde kullanılır; acılık genellikle orta düzeydedir ve lezzeti yuvarlaktır.\"\n  },\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"updated\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"general culinary sensory knowledge\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:urfa-biber",
                "expected": "ogs:core:ingredient:urfa-biber",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "spice",
                "expected": "spice",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2500,
            "output_tokens": 833,
            "latency_ms": 5699.3,
            "usd": 0.000458,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 2500.0,
        "output_tokens_mean": 737.0,
        "latency_ms_mean": 5018.867,
        "usd_sum": 0.001259
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-009",
      "category": "structured_output",
      "capability": "schema_valid_ingredient",
      "difficulty": "easy",
      "cuisine_region": "american",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.ingredient",
        "vocab.ingredient_categories",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "a6a51cef64e43626",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS ingredient schema (`schema/ingredient.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/ingredient.schema.json\",\n  \"title\": \"OGS Ingredient\",\n  \"description\": \"An ingredient entity with sensory profile, category, and provenance metadata.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"category\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$\",\n      \"description\": \"OGS identifier in the format ogs:<namespace>:ingredient:<id>\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\",\n      \"description\": \"OGS specification version (0.1.x or 0.2.x)\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\",\n      \"description\": \"Authoritative / endonym language of this entity's human-readable fields.\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      },\n      \"description\": \"Advisory list of languages covered by this entity's LString fields.\"\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"category\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"protein\",\n        \"vegetable\",\n        \"fruit\",\n        \"grain\",\n        \"dairy\",\n        \"fat\",\n        \"herb\",\n        \"spice\",\n        \"fungus\",\n        \"legume\",\n        \"nut\",\n        \"seed\",\n        \"sweetener\",\n        \"acid\",\n        \"salt\",\n        \"other\"\n      ]\n    },\n    \"subcategory\": {\n      \"type\": \"string\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"origin\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"region\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"season\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"terroir_notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Ingredient categories\n\nprotein, vegetable, fruit, grain, dairy, fat, herb, spice, fungus, legume, nut, seed, sweetener, acid, salt, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document that describes 'Meyer lemon' as an OGS ingredient. It MUST validate against the OGS ingredient schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"ingredient_name\": \"Meyer lemon\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"meyer-lemon\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/ingredient.schema.json",
        "required_fields": {
          "id": "ogs:core:ingredient:meyer-lemon",
          "ogs_version": "0.1.0",
          "category": "fruit"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:meyer-lemon\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": {\n    \"en\": \"Meyer lemon\"\n  },\n  \"category\": \"fruit\",\n  \"subcategory\": \"citrus\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 3.5,\n      \"sour\": 7.5,\n      \"salty\": 0,\n      \"bitter\": 1.5,\n      \"umami\": 0\n    },\n    \"structural\": {\n      \"body\": 2.5,\n      \"acidity_perception\": 7.5,\n      \"tannin\": 0.3,\n      \"astringency\": 0.4,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"lemon-like zest\",\n        \"qualifier\": \"zest\",\n        \"intensity\": 7.2\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"citrus blossom\",\n        \"item\": \"soft floral lift\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 3.8\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"citrus peel\",\n        \"item\": \"light aromatic spice\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 1.8\n      }\n    ],\n    \"texture\": [\n      \"juicy\",\n      \"tender vesicles\"\n    ],\n    \"finish_length\": 4,\n    \"complexity\": 4.8\n  },\n  \"origin\": {\n    \"region\": {\n      \"en\": \"Originally introduced from China via Western horticulture\"\n    },\n    \"season\": {\n      \"en\": \"Winter to early spring (typical availability varies by region)\"\n    },\n    \"terroir_notes\": {\n      \"en\": \"Flavor is influenced by cultivar and growing conditions; Meyer lemons are often perceived as sweeter and less harshly acidic than standard lemons.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Often described as having a slightly sweeter, less sharp acidity profile with fragrant, aromatic peel.\"\n  },\n  \"metadata\": {\n    \"source\": \"General culinary knowledge; no specific dataset provided.\",\n    \"license\": \"unknown\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:meyer-lemon",
                "expected": "ogs:core:ingredient:meyer-lemon",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "fruit",
                "expected": "fruit",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2500,
            "output_tokens": 546,
            "latency_ms": 4003.9,
            "usd": 0.000343,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:meyer-lemon\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": {\n    \"labels\": [\n      {\n        \"value\": \"Meyer lemon\",\n        \"language\": \"en\",\n        \"role\": \"preferred\"\n      }\n    ]\n  },\n  \"category\": \"fruit\",\n  \"subcategory\": \"citrus\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 3.0,\n      \"sour\": 7.5,\n      \"salty\": 0,\n      \"bitter\": 1.5,\n      \"umami\": 0\n    },\n    \"structural\": {\n      \"body\": 2.0,\n      \"acidity_perception\": 8.0,\n      \"tannin\": 0,\n      \"astringency\": 0.5,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"lemony\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"citrus blossom\",\n        \"item\": \"petal-like notes\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"vegetal\",\n        \"group\": \"zest oils\",\n        \"item\": \"zesty peel\",\n        \"qualifier\": \"zest\",\n        \"intensity\": 6.0\n      }\n    ],\n    \"texture\": [\n      \"juicy\",\n      \"high-acid pulp\"\n    ],\n    \"finish_length\": 4.5,\n    \"complexity\": 5.5\n  },\n  \"origin\": {\n    \"region\": {\n      \"labels\": [\n        {\n          \"value\": \"China (historical origin) and later cultivated in North America\",\n          \"language\": \"en\",\n          \"role\": \"descriptive\"\n        }\n      ]\n    },\n    \"season\": {\n      \"labels\": [\n        {\n          \"value\": \"winter to early spring (commonly available seasonally)\",\n          \"language\": \"en\",\n          \"role\": \"descriptive\"\n        }\n      ]\n    }\n  },\n  \"notes\": {\n    \"labels\": [\n      {\n        \"value\": \"A hybrid lemon variety known for a thinner, more aromatic peel and a less sharp acidity than typical lemons.\",\n        \"language\": \"en\",\n        \"role\": \"descriptive\"\n      }\n    ]\n  },\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"updated\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"general culinary knowledge\",\n    \"license\": \"unspecified\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:meyer-lemon",
                "expected": "ogs:core:ingredient:meyer-lemon",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "fruit",
                "expected": "fruit",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2500,
            "output_tokens": 648,
            "latency_ms": 4559.8,
            "usd": 0.000384,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:meyer-lemon\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": {\n    \"labels\": [\n      {\n        \"value\": \"Meyer lemon\",\n        \"language\": \"en\",\n        \"role\": \"preferred\"\n      }\n    ]\n  },\n  \"category\": \"fruit\",\n  \"subcategory\": \"citrus fruit\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 3.2,\n      \"sour\": 7.8,\n      \"salty\": 0,\n      \"bitter\": 1.5,\n      \"umami\": 0\n    },\n    \"structural\": {\n      \"body\": 1.8,\n      \"acidity_perception\": 8.2,\n      \"tannin\": 0,\n      \"astringency\": 0.4,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"lemon\",\n        \"intensity\": 8.0,\n        \"qualifier\": \"fresh\"\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"blossom\",\n        \"item\": \"citrus blossom\",\n        \"intensity\": 3.2,\n        \"qualifier\": \"fresh\"\n      },\n      {\n        \"family\": \"vegetal\",\n        \"group\": \"zest\",\n        \"item\": \"lemon zest\",\n        \"intensity\": 6.8,\n        \"qualifier\": \"zest\"\n      }\n    ],\n    \"texture\": [\n      \"juicy\"\n    ],\n    \"finish_length\": 3.5,\n    \"complexity\": 4.0\n  },\n  \"notes\": {\n    \"labels\": [\n      {\n        \"value\": \"Typically sweeter and less acidic than common lemons; prized for aromatic zest and juice.\",\n        \"language\": \"en\",\n        \"role\": \"descriptive\"\n      }\n    ]\n  },\n  \"metadata\": {\n    \"source\": \"OGS task generation\",\n    \"license\": \"unspecified\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:meyer-lemon",
                "expected": "ogs:core:ingredient:meyer-lemon",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "fruit",
                "expected": "fruit",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2500,
            "output_tokens": 490,
            "latency_ms": 3602.8,
            "usd": 0.000321,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 2500.0,
        "output_tokens_mean": 561.333,
        "latency_ms_mean": 4055.5,
        "usd_sum": 0.001048
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-010",
      "category": "structured_output",
      "capability": "schema_valid_ingredient",
      "difficulty": "hard",
      "cuisine_region": "mexican",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.ingredient",
        "vocab.ingredient_categories",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "6d4d6d3cc23f3283",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS ingredient schema (`schema/ingredient.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/ingredient.schema.json\",\n  \"title\": \"OGS Ingredient\",\n  \"description\": \"An ingredient entity with sensory profile, category, and provenance metadata.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"category\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$\",\n      \"description\": \"OGS identifier in the format ogs:<namespace>:ingredient:<id>\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\",\n      \"description\": \"OGS specification version (0.1.x or 0.2.x)\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\",\n      \"description\": \"Authoritative / endonym language of this entity's human-readable fields.\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      },\n      \"description\": \"Advisory list of languages covered by this entity's LString fields.\"\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"category\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"protein\",\n        \"vegetable\",\n        \"fruit\",\n        \"grain\",\n        \"dairy\",\n        \"fat\",\n        \"herb\",\n        \"spice\",\n        \"fungus\",\n        \"legume\",\n        \"nut\",\n        \"seed\",\n        \"sweetener\",\n        \"acid\",\n        \"salt\",\n        \"other\"\n      ]\n    },\n    \"subcategory\": {\n      \"type\": \"string\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"origin\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"region\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"season\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"terroir_notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Ingredient categories\n\nprotein, vegetable, fruit, grain, dairy, fat, herb, spice, fungus, legume, nut, seed, sweetener, acid, salt, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document that describes 'wild huitlacoche' as an OGS ingredient. It MUST validate against the OGS ingredient schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"ingredient_name\": \"wild huitlacoche\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"huitlacoche\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/ingredient.schema.json",
        "required_fields": {
          "id": "ogs:core:ingredient:huitlacoche",
          "ogs_version": "0.1.0",
          "category": "fungus"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:huitlacoche\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"es-MX\",\n  \"name\": {\n    \"labels\": [\n      {\n        \"value\": \"huitlacoche (hongo del maíz), silvestre\",\n        \"language\": \"es-MX\",\n        \"role\": \"preferred\"\n      }\n    ]\n  },\n  \"description\": {\n    \"labels\": [\n      {\n        \"value\": \"Hongo comestible que parasita el maíz; en su forma silvestre se usa como ingrediente culinario (p. ej., en tacos y salsas).\",\n        \"language\": \"es-MX\",\n        \"role\": \"descriptive\"\n      }\n    ]\n  },\n  \"category\": \"fungus\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 0.3,\n      \"sour\": 0.2,\n      \"salty\": 1.5,\n      \"bitter\": 2.0,\n      \"umami\": 7.5\n    },\n    \"structural\": {\n      \"body\": 4.8,\n      \"acidity_perception\": 1.0,\n      \"tannin\": 0.5,\n      \"astringency\": 1.2,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"earth\",\n        \"group\": \"fungal/terroso\",\n        \"item\": \"tierra húmeda\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"savory\",\n        \"group\": \"umami\",\n        \"item\": \"caldo/cecina\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"madera\",\n        \"item\": \"hongo tostado\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"vegetal\",\n        \"group\": \"maíz\",\n        \"item\": \"maíz asado\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 3.8\n      }\n    ],\n    \"texture\": [\n      \"carnoso\",\n      \"jugoso\"\n    ],\n    \"finish_length\": 4.2,\n    \"complexity\": 6.8\n  },\n  \"origin\": {\n    \"region\": {\n      \"labels\": [\n        {\n          \"value\": \"Mesoamérica (principalmente México)\",\n          \"language\": \"es-MX\",\n          \"role\": \"preferred\"\n        }\n      ]\n    },\n    \"terroir_notes\": {\n      \"labels\": [\n        {\n          \"value\": \"Frecuente en regiones donde el maíz se cultiva y el hongo se desarrolla en condiciones locales.\",\n          \"language\": \"es-MX\",\n          \"role\": \"descriptive\"\n        }\n      ]\n    }\n  },\n  \"notes\": {\n    \"labels\": [\n      {\n        \"value\": \"Perfil orientativo; el sabor y el aroma cambian según madurez del hongo y método de cocción (sofreír, guisar, asar).\",\n        \"language\": \"es-MX\",\n        \"role\": \"descriptive\"\n      }\n    ]\n  },\n  \"metadata\": {\n    \"source\": \"Conocimiento gastronómico general sobre huitlacoche silvestre\",\n    \"license\": \"unspecified\",\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"updated\": \"2026-04-18T00:00:00Z\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:huitlacoche",
                "expected": "ogs:core:ingredient:huitlacoche",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "fungus",
                "expected": "fungus",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2502,
            "output_tokens": 825,
            "latency_ms": 5767.0,
            "usd": 0.000455,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:huitlacoche\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"es-MX\",\n  \"name\": {\n    \"labels\": [\n      {\n        \"value\": \"huitlacoche\",\n        \"language\": \"es-MX\",\n        \"role\": \"preferred\"\n      }\n    ]\n  },\n  \"description\": {\n    \"labels\": [\n      {\n        \"value\": \"Mazorca de maíz infectada por el hongo Ustilago maydis; comúnmente llamada huitlacoche. Variante silvestre.\",\n        \"language\": \"es-MX\",\n        \"role\": \"descriptive\"\n      }\n    ]\n  },\n  \"category\": \"fungus\",\n  \"subcategory\": \"maíz-infectado (Ustilago maydis)\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 1.0,\n      \"sour\": 0.5,\n      \"salty\": 2.0,\n      \"bitter\": 1.5,\n      \"umami\": 7.8\n    },\n    \"structural\": {\n      \"body\": 4.5,\n      \"acidity_perception\": 1.0,\n      \"tannin\": 0.2,\n      \"astringency\": 0.4,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"earth\",\n        \"group\": \"terroso\",\n        \"item\": \"moho/tierrra húmeda\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 6.3\n      },\n      {\n        \"family\": \"savory\",\n        \"group\": \"umami\",\n        \"item\": \"champiñón/vegetal cocido\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"vegetal\",\n        \"group\": \"maíz\",\n        \"item\": \"mazorca cocida\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"especiado sutil\",\n        \"item\": \"pimienta negra\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 2.2\n      }\n    ],\n    \"texture\": [\n      \"carnoso\",\n      \"jugoso\",\n      \"tierno\"\n    ],\n    \"finish_length\": 4.0,\n    \"complexity\": 6.0\n  },\n  \"origin\": {\n    \"region\": {\n      \"labels\": [\n        {\n          \"value\": \"Mesoamérica (principalmente México)\",\n          \"language\": \"es-MX\",\n          \"role\": \"descriptive\"\n        }\n      ]\n    },\n    \"season\": {\n      \"labels\": [\n        {\n          \"value\": \"temporada de lluvias (variable por región)\",\n          \"language\": \"es-MX\",\n          \"role\": \"descriptive\"\n        }\n      ]\n    },\n    \"terroir_notes\": {\n      \"labels\": [\n        {\n          \"value\": \"Se desarrolla naturalmente en mazorcas de maíz; la intensidad puede variar según el clima y el estado del maíz huésped.\",\n          \"language\": \"es-MX\",\n          \"role\": \"descriptive\"\n        }\n      ]\n    }\n  },\n  \"notes\": {\n    \"labels\": [\n      {\n        \"value\": \"“Wild huitlacoche” se refiere a huitlacoche recolectada de forma no cultivada (crecimiento natural).\",\n        \"language\": \"es-MX\",\n        \"role\": \"descriptive\"\n      }\n    ]\n  },\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"updated\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"general culinary knowledge\",\n    \"license\": \"unknown\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:huitlacoche",
                "expected": "ogs:core:ingredient:huitlacoche",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "fungus",
                "expected": "fungus",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2502,
            "output_tokens": 876,
            "latency_ms": 6058.0,
            "usd": 0.000476,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:huitlacoche\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": {\n    \"en\": \"wild huitlacoche\"\n  },\n  \"primary_language\": \"en\",\n  \"category\": \"fungus\",\n  \"subcategory\": \"corn smut (wild)\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 0.5,\n      \"sour\": 0,\n      \"salty\": 1.0,\n      \"bitter\": 0.8,\n      \"umami\": 6.5\n    },\n    \"structural\": {\n      \"body\": 4.5,\n      \"acidity_perception\": 0.5,\n      \"tannin\": 0,\n      \"astringency\": 0,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"earth\",\n        \"group\": \"fungal\",\n        \"item\": \"earthy/woodsy notes\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"savory\",\n        \"group\": \"umami\",\n        \"item\": \"roasted corn & mushroom-like savoriness\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"vegetal\",\n        \"group\": \"corn\",\n        \"item\": \"green corn husk character\",\n        \"intensity\": 4.5,\n        \"qualifier\": \"cooked\"\n      }\n    ],\n    \"texture\": [\n      \"meaty\",\n      \"slightly spongy\",\n      \"tender when cooked\"\n    ],\n    \"finish_length\": 4.0,\n    \"complexity\": 6.5\n  },\n  \"notes\": {\n    \"en\": \"Huitlacoche is the corn smut fungus (Ustilago maydis). In culinary use, it is typically cooked until savory and tender, often showing strong umami and earthy, mushroom-like character.\"\n  },\n  \"metadata\": {\n    \"source\": \"general culinary knowledge; sensory profile is an approximate descriptor\",\n    \"license\": \"unknown\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:huitlacoche",
                "expected": "ogs:core:ingredient:huitlacoche",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "fungus",
                "expected": "fungus",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2502,
            "output_tokens": 479,
            "latency_ms": 3571.3,
            "usd": 0.000317,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 2502.0,
        "output_tokens_mean": 726.667,
        "latency_ms_mean": 5132.1,
        "usd_sum": 0.001248
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-011",
      "category": "structured_output",
      "capability": "schema_valid_beverage",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.beverage",
        "vocab.beverage_types",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "6615664480900680",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS beverage schema (`schema/beverage.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/beverage.schema.json\",\n  \"title\": \"OGS Beverage\",\n  \"description\": \"A beverage entity with type, structural attributes, sensory profile, and provenance.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"type\",\n    \"sensory_profile\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:beverage:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"type\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"wine\",\n        \"beer\",\n        \"spirit\",\n        \"cider\",\n        \"sake\",\n        \"other\"\n      ]\n    },\n    \"subtype\": {\n      \"type\": \"string\"\n    },\n    \"grape_varieties\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"type\": \"string\"\n      }\n    },\n    \"vintage\": {\n      \"type\": \"integer\"\n    },\n    \"region\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"appellation\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"producer\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"abv\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 100\n    },\n    \"residual_sugar\": {\n      \"type\": \"number\",\n      \"minimum\": 0\n    },\n    \"serve_temperature_c\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"min\": {\n          \"type\": \"number\"\n        },\n        \"max\": {\n          \"type\": \"number\"\n        }\n      },\n      \"additionalProperties\": false,\n      \"description\": \"Recommended serving temperature range in degrees Celsius (SI canonical).\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Beverage types\n\nwine, beer, spirit, cider, sake, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document describing 'Barolo' as an OGS beverage. It MUST validate against the OGS beverage schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"beverage_name\": \"Barolo\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"barolo-docg\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/beverage.schema.json",
        "required_fields": {
          "id": "ogs:core:beverage:barolo-docg",
          "ogs_version": "0.1.0",
          "type": "wine"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:barolo-docg\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Barolo\",\n  \"type\": \"wine\",\n  \"region\": \"Piedmont\",\n  \"appellation\": \"Barolo DOCG\",\n  \"grape_varieties\": [\n    \"Nebbiolo\"\n  ],\n  \"serve_temperature_c\": {\n    \"min\": 14,\n    \"max\": 18\n  },\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 0.5,\n      \"sour\": 3.5,\n      \"salty\": 0.2,\n      \"bitter\": 2.8,\n      \"umami\": 1.8\n    },\n    \"structural\": {\n      \"body\": 7.5,\n      \"acidity_perception\": 6.5,\n      \"tannin\": 8.5,\n      \"astringency\": 7.8,\n      \"alcohol_heat\": 4.5,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"red fruit\",\n        \"item\": \"cherry\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 5.8\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"red fruit\",\n        \"item\": \"red plum\",\n        \"intensity\": 4.2\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"violet\",\n        \"item\": \"violet\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"warm spices\",\n        \"item\": \"rose pepper\",\n        \"intensity\": 3.6\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"forest floor\",\n        \"item\": \"truffle\",\n        \"intensity\": 3.2\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"barrel influence\",\n        \"item\": \"cedar\",\n        \"intensity\": 2.8\n      }\n    ],\n    \"texture\": [\n      \"grippy tannins\",\n      \"structured mouthfeel\"\n    ],\n    \"finish_length\": 8.3,\n    \"complexity\": 8.2\n  },\n  \"metadata\": {\n    \"source\": \"generalized sensory profile for Barolo (Nebbiolo DOCG); not a specific producer/vintage\",\n    \"license\": \"unspecified\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:barolo-docg",
                "expected": "ogs:core:beverage:barolo-docg",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "wine",
                "expected": "wine",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2506,
            "output_tokens": 568,
            "latency_ms": 3965.9,
            "usd": 0.000353,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:barolo-docg\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Barolo\",\n  \"type\": \"wine\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 0.5,\n      \"sour\": 5,\n      \"salty\": 0.5,\n      \"bitter\": 4,\n      \"umami\": 2\n    },\n    \"structural\": {\n      \"body\": 7.5,\n      \"acidity_perception\": 6,\n      \"tannin\": 7.5,\n      \"astringency\": 7,\n      \"alcohol_heat\": 3,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"red fruit\",\n        \"item\": \"cherry\",\n        \"intensity\": 6\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"dried fruit\",\n        \"item\": \"dried cherry\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 4\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"rose\",\n        \"item\": \"dried rose\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 3\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"aromatic spice\",\n        \"item\": \"tar\",\n        \"intensity\": 2.5\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"baking spice\",\n        \"item\": \"cocoa\",\n        \"intensity\": 2.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"forest floor\",\n        \"item\": \"truffle\",\n        \"intensity\": 3\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"barrel\",\n        \"item\": \"cedar\",\n        \"intensity\": 3\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"dried herbs\",\n        \"item\": \"dried oregano\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 2\n      }\n    ],\n    \"texture\": [\n      \"dry\",\n      \"grippy\",\n      \"structured\"\n    ],\n    \"finish_length\": 8,\n    \"complexity\": 8.5\n  },\n  \"primary_language\": \"en\",\n  \"available_languages\": [\n    \"en\",\n    \"it\"\n  ],\n  \"description\": \"A traditional Italian wine (DOCG) from Piedmont, typically crafted from Nebbiolo, known for high tannins, pronounced acidity, and long aging potential.\"\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:barolo-docg",
                "expected": "ogs:core:beverage:barolo-docg",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "wine",
                "expected": "wine",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2506,
            "output_tokens": 603,
            "latency_ms": 3803.6,
            "usd": 0.000367,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:barolo-docg\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Barolo\",\n  \"type\": \"wine\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 0,\n      \"sour\": 4,\n      \"salty\": 0,\n      \"bitter\": 3,\n      \"umami\": 2\n    },\n    \"structural\": {\n      \"body\": 7,\n      \"acidity_perception\": 6,\n      \"tannin\": 7,\n      \"astringency\": 6,\n      \"alcohol_heat\": 4,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"red fruit\",\n        \"item\": \"cherry\",\n        \"intensity\": 6\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"dark fruit\",\n        \"item\": \"blackberry\",\n        \"intensity\": 5\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"dried florals\",\n        \"item\": \"rose\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 3\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"botanical\",\n        \"item\": \"tar\",\n        \"intensity\": 2\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"wet earth\",\n        \"intensity\": 4\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"warm spice\",\n        \"item\": \"cinnamon\",\n        \"intensity\": 3\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"barrel\",\n        \"item\": \"vanilla\",\n        \"intensity\": 2\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"barrel\",\n        \"item\": \"cedar\",\n        \"intensity\": 2\n      }\n    ],\n    \"texture\": [\n      \"grippy\",\n      \"silky\",\n      \"dry\"\n    ],\n    \"finish_length\": 8,\n    \"complexity\": 8\n  },\n  \"region\": \"Piedmont, Italy\",\n  \"appellation\": \"Barolo DOCG\",\n  \"producer\": \"varies\",\n  \"abv\": 13.5\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:barolo-docg",
                "expected": "ogs:core:beverage:barolo-docg",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "wine",
                "expected": "wine",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2506,
            "output_tokens": 544,
            "latency_ms": 3758.8,
            "usd": 0.000343,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 2506.0,
        "output_tokens_mean": 571.667,
        "latency_ms_mean": 3842.767,
        "usd_sum": 0.001063
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-012",
      "category": "structured_output",
      "capability": "schema_valid_beverage",
      "difficulty": "hard",
      "cuisine_region": "scottish",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.beverage",
        "vocab.beverage_types",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "bd330891a3e68b59",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS beverage schema (`schema/beverage.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/beverage.schema.json\",\n  \"title\": \"OGS Beverage\",\n  \"description\": \"A beverage entity with type, structural attributes, sensory profile, and provenance.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"type\",\n    \"sensory_profile\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:beverage:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"type\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"wine\",\n        \"beer\",\n        \"spirit\",\n        \"cider\",\n        \"sake\",\n        \"other\"\n      ]\n    },\n    \"subtype\": {\n      \"type\": \"string\"\n    },\n    \"grape_varieties\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"type\": \"string\"\n      }\n    },\n    \"vintage\": {\n      \"type\": \"integer\"\n    },\n    \"region\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"appellation\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"producer\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"abv\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 100\n    },\n    \"residual_sugar\": {\n      \"type\": \"number\",\n      \"minimum\": 0\n    },\n    \"serve_temperature_c\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"min\": {\n          \"type\": \"number\"\n        },\n        \"max\": {\n          \"type\": \"number\"\n        }\n      },\n      \"additionalProperties\": false,\n      \"description\": \"Recommended serving temperature range in degrees Celsius (SI canonical).\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Beverage types\n\nwine, beer, spirit, cider, sake, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document describing 'Islay single malt' as an OGS beverage. It MUST validate against the OGS beverage schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"beverage_name\": \"Islay single malt\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"ardbeg-10-islay\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/beverage.schema.json",
        "required_fields": {
          "id": "ogs:core:beverage:ardbeg-10-islay",
          "ogs_version": "0.1.0",
          "type": "spirit"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:ardbeg-10-islay\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Islay single malt\",\n  \"type\": \"spirit\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 2.5,\n      \"sour\": 0.5,\n      \"salty\": 1.5,\n      \"bitter\": 2.0,\n      \"umami\": 0.8\n    },\n    \"structural\": {\n      \"body\": 6.0,\n      \"acidity_perception\": 1.2,\n      \"tannin\": 1.0,\n      \"astringency\": 1.0,\n      \"alcohol_heat\": 5.5,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"marine\",\n        \"group\": \"sea-spray/iodine\",\n        \"item\": \"iodine\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"peat/smoke\",\n        \"item\": \"peat smoke\",\n        \"intensity\": 7.5\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"warming spice\",\n        \"item\": \"black pepper\",\n        \"intensity\": 3.0\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"barrel\",\n        \"item\": \"toasted oak\",\n        \"intensity\": 4.0\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"sweet accents\",\n        \"item\": \"vanilla\",\n        \"intensity\": 2.5\n      }\n    ],\n    \"texture\": [\n      \"warmed\",\n      \"silky\"\n    ],\n    \"finish_length\": 7.5,\n    \"complexity\": 7.0\n  },\n  \"notes\": \"Peated Islay single malt profile with smoky peat, maritime character, and toasted oak/spice accents.\"\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:ardbeg-10-islay",
                "expected": "ogs:core:beverage:ardbeg-10-islay",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "spirit",
                "expected": "spirit",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2512,
            "output_tokens": 459,
            "latency_ms": 3207.9,
            "usd": 0.000309,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:ardbeg-10-islay\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Islay single malt\",\n  \"type\": \"spirit\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 2.0,\n      \"sour\": 1.0,\n      \"salty\": 3.0,\n      \"bitter\": 2.5,\n      \"umami\": 1.0\n    },\n    \"structural\": {\n      \"body\": 7.0,\n      \"acidity_perception\": 1.0,\n      \"tannin\": 1.5,\n      \"astringency\": 1.5,\n      \"alcohol_heat\": 6.0,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"marine\",\n        \"group\": \"peat-smoke\",\n        \"item\": \"briny smoke\",\n        \"intensity\": 7.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"peat\",\n        \"item\": \"smoldering peat\",\n        \"intensity\": 8.0\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"oak\",\n        \"item\": \"charred oak\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"smoke-spice\",\n        \"item\": \"black pepper\",\n        \"intensity\": 3.5\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"sweetened smoke\",\n        \"item\": \"toffee\",\n        \"intensity\": 2.5\n      }\n    ],\n    \"texture\": [\n      \"oily\"\n    ],\n    \"finish_length\": 8.0,\n    \"complexity\": 7.5\n  },\n  \"primary_language\": \"en\",\n  \"available_languages\": [\n    \"en\"\n  ]\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:ardbeg-10-islay",
                "expected": "ogs:core:beverage:ardbeg-10-islay",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "spirit",
                "expected": "spirit",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2512,
            "output_tokens": 447,
            "latency_ms": 3179.4,
            "usd": 0.000304,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:ardbeg-10-islay\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Islay single malt\",\n  \"type\": \"spirit\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 3.0,\n      \"sour\": 1.0,\n      \"salty\": 1.5,\n      \"bitter\": 2.0,\n      \"umami\": 0.5\n    },\n    \"structural\": {\n      \"body\": 6.5,\n      \"acidity_perception\": 1.0,\n      \"tannin\": 2.0,\n      \"astringency\": 1.5,\n      \"alcohol_heat\": 5.0,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"smoke\",\n        \"group\": \"peat\",\n        \"item\": \"peated smoke\",\n        \"intensity\": 8.0\n      },\n      {\n        \"family\": \"marine\",\n        \"group\": \"coastal\",\n        \"item\": \"sea spray\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"peaty\",\n        \"item\": \"damp peat\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"barrel\",\n        \"item\": \"toasted oak\",\n        \"intensity\": 4.0\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"warming\",\n        \"item\": \"black pepper\",\n        \"intensity\": 2.5\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"sweet notes\",\n        \"item\": \"caramel\",\n        \"intensity\": 3.5\n      }\n    ],\n    \"texture\": [\n      \"oily\"\n    ],\n    \"finish_length\": 7.5,\n    \"complexity\": 7.0\n  },\n  \"description\": \"A peaty, coastal single malt style characteristic of Islay, with smoky-earthy aromatics and a long warming finish.\",\n  \"metadata\": {\n    \"source\": \"general beverage sensory profiling\",\n    \"license\": \"unspecified\"\n  }\n}",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "'smoke' is not one of ['fruit', 'floral', 'herbal', 'spice', 'earth', 'wood', 'dairy', 'savory', 'confection', 'vegetal', 'marine']"
            ],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:ardbeg-10-islay",
                "expected": "ogs:core:beverage:ardbeg-10-islay",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "spirit",
                "expected": "spirit",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2512,
            "output_tokens": 514,
            "latency_ms": 3347.1,
            "usd": 0.000331,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.5774,
        "sem": 0.3333,
        "ci95_lo": 0.0133,
        "ci95_hi": 1.32,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 2512.0,
        "output_tokens_mean": 473.333,
        "latency_ms_mean": 3244.8,
        "usd_sum": 0.000944
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-013",
      "category": "structured_output",
      "capability": "schema_valid_beverage",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.beverage",
        "vocab.beverage_types",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "872d23f39307a374",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS beverage schema (`schema/beverage.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/beverage.schema.json\",\n  \"title\": \"OGS Beverage\",\n  \"description\": \"A beverage entity with type, structural attributes, sensory profile, and provenance.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"type\",\n    \"sensory_profile\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:beverage:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"type\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"wine\",\n        \"beer\",\n        \"spirit\",\n        \"cider\",\n        \"sake\",\n        \"other\"\n      ]\n    },\n    \"subtype\": {\n      \"type\": \"string\"\n    },\n    \"grape_varieties\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"type\": \"string\"\n      }\n    },\n    \"vintage\": {\n      \"type\": \"integer\"\n    },\n    \"region\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"appellation\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"producer\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"abv\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 100\n    },\n    \"residual_sugar\": {\n      \"type\": \"number\",\n      \"minimum\": 0\n    },\n    \"serve_temperature_c\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"min\": {\n          \"type\": \"number\"\n        },\n        \"max\": {\n          \"type\": \"number\"\n        }\n      },\n      \"additionalProperties\": false,\n      \"description\": \"Recommended serving temperature range in degrees Celsius (SI canonical).\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Beverage types\n\nwine, beer, spirit, cider, sake, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document describing 'junmai ginjo sake' as an OGS beverage. It MUST validate against the OGS beverage schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"beverage_name\": \"junmai ginjo sake\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"junmai-ginjo-generic\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/beverage.schema.json",
        "required_fields": {
          "id": "ogs:core:beverage:junmai-ginjo-generic",
          "ogs_version": "0.1.0",
          "type": "sake"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:junmai-ginjo-generic\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"junmai ginjo sake\",\n  \"type\": \"sake\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\n    \"en\"\n  ],\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 2.0,\n      \"sour\": 2.5,\n      \"salty\": 0.5,\n      \"bitter\": 1.0,\n      \"umami\": 3.0\n    },\n    \"structural\": {\n      \"body\": 4.5,\n      \"acidity_perception\": 3.5,\n      \"tannin\": 0.0,\n      \"astringency\": 0.5,\n      \"alcohol_heat\": 2.5,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"zest\",\n        \"qualifier\": \"zest\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"white flowers\",\n        \"item\": \"aromatic blossoms\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 3.5\n      },\n      {\n        \"family\": \"savory\",\n        \"group\": \"rice-derived\",\n        \"item\": \"umami notes\",\n        \"intensity\": 2.5\n      }\n    ],\n    \"texture\": [\n      \"silky\",\n      \"smooth\"\n    ],\n    \"finish_length\": 6.5,\n    \"complexity\": 6.0\n  },\n  \"description\": \"A clean, aromatic junmai ginjo sake profile with moderate acidity, low tannin/astringency, and a smooth, lingering finish.\"\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:junmai-ginjo-generic",
                "expected": "ogs:core:beverage:junmai-ginjo-generic",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "sake",
                "expected": "sake",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2515,
            "output_tokens": 426,
            "latency_ms": 3083.6,
            "usd": 0.000296,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:junmai-ginjo-generic\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"junmai ginjo sake\",\n  \"type\": \"sake\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\n    \"en\",\n    \"ja\"\n  ],\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 2.0,\n      \"sour\": 2.5,\n      \"salty\": 0.8,\n      \"bitter\": 1.0,\n      \"umami\": 2.8\n    },\n    \"structural\": {\n      \"body\": 4.8,\n      \"acidity_perception\": 3.5,\n      \"tannin\": 0.0,\n      \"astringency\": 0.2,\n      \"alcohol_heat\": 2.0,\n      \"effervescence\": 0.1\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"zest\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 3.8\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"aromatic flowers\",\n        \"item\": \"white flowers\",\n        \"intensity\": 3.2\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"light sweetness\",\n        \"item\": \"rice sweetness\",\n        \"intensity\": 2.2\n      },\n      {\n        \"family\": \"vegetal\",\n        \"group\": \"fresh rice character\",\n        \"item\": \"fresh grain\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 2.5\n      }\n    ],\n    \"texture\": [\n      \"silky\",\n      \"clean\"\n    ],\n    \"finish_length\": 5.2,\n    \"complexity\": 6.0\n  },\n  \"description\": {\n    \"en\": \"A Junmai Ginjo sake with a clean, elegant profile, delicate aromatics, and balanced acidity.\",\n    \"ja\": \"上品で繊細な香りと、きれいな飲み口、バランスの取れた酸味が特徴の純米吟醸酒。\"\n  },\n  \"notes\": \"Typical Junmai Ginjo styling emphasizes purity, aromatic lift, and a smooth, lightly rounded palate with minimal harshness.\"\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:junmai-ginjo-generic",
                "expected": "ogs:core:beverage:junmai-ginjo-generic",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "sake",
                "expected": "sake",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2515,
            "output_tokens": 540,
            "latency_ms": 3478.9,
            "usd": 0.000342,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:junmai-ginjo-generic\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"junmai ginjo sake\",\n  \"type\": \"sake\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\"],\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 2.5,\n      \"sour\": 2,\n      \"salty\": 0.5,\n      \"bitter\": 1.5,\n      \"umami\": 2.5\n    },\n    \"structural\": {\n      \"body\": 4.5,\n      \"acidity_perception\": 3,\n      \"tannin\": 0,\n      \"astringency\": 0.5,\n      \"alcohol_heat\": 2.5,\n      \"effervescence\": 1\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"floral\",\n        \"group\": \"aroma\",\n        \"item\": \"white blossom\",\n        \"intensity\": 3\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"aroma\",\n        \"item\": \"citrus\",\n        \"qualifier\": \"zest\",\n        \"intensity\": 3.5\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"aroma\",\n        \"item\": \"rice pudding\",\n        \"intensity\": 2\n      },\n      {\n        \"family\": \"savory\",\n        \"group\": \"aroma\",\n        \"item\": \"umami notes\",\n        \"intensity\": 2\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"aroma\",\n        \"item\": \"green herb\",\n        \"intensity\": 1.5\n      }\n    ],\n    \"texture\": [\"silky\", \"clean\"],\n    \"finish_length\": 6,\n    \"complexity\": 5\n  },\n  \"abv\": 15,\n  \"serve_temperature_c\": {\n    \"min\": 5,\n    \"max\": 15\n  },\n  \"notes\": \"Junmai Ginjo style typically emphasizes fragrant, clean aromatics with a balanced, smooth palate and a dry finish.\",\n  \"metadata\": {\n    \"source\": \"generated\",\n    \"created\": \"2026-04-18T00:00:00Z\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:junmai-ginjo-generic",
                "expected": "ogs:core:beverage:junmai-ginjo-generic",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "sake",
                "expected": "sake",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2515,
            "output_tokens": 525,
            "latency_ms": 3594.3,
            "usd": 0.000336,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 2515.0,
        "output_tokens_mean": 497.0,
        "latency_ms_mean": 3385.6,
        "usd_sum": 0.000974
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-014",
      "category": "structured_output",
      "capability": "schema_valid_beverage",
      "difficulty": "easy",
      "cuisine_region": "german",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.beverage",
        "vocab.beverage_types",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "3fe3fe74f8d80260",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS beverage schema (`schema/beverage.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/beverage.schema.json\",\n  \"title\": \"OGS Beverage\",\n  \"description\": \"A beverage entity with type, structural attributes, sensory profile, and provenance.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"type\",\n    \"sensory_profile\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:beverage:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"type\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"wine\",\n        \"beer\",\n        \"spirit\",\n        \"cider\",\n        \"sake\",\n        \"other\"\n      ]\n    },\n    \"subtype\": {\n      \"type\": \"string\"\n    },\n    \"grape_varieties\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"type\": \"string\"\n      }\n    },\n    \"vintage\": {\n      \"type\": \"integer\"\n    },\n    \"region\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"appellation\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"producer\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"abv\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 100\n    },\n    \"residual_sugar\": {\n      \"type\": \"number\",\n      \"minimum\": 0\n    },\n    \"serve_temperature_c\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"min\": {\n          \"type\": \"number\"\n        },\n        \"max\": {\n          \"type\": \"number\"\n        }\n      },\n      \"additionalProperties\": false,\n      \"description\": \"Recommended serving temperature range in degrees Celsius (SI canonical).\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Beverage types\n\nwine, beer, spirit, cider, sake, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document describing 'Oktoberfest Märzen lager' as an OGS beverage. It MUST validate against the OGS beverage schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"beverage_name\": \"Oktoberfest Märzen lager\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"oktoberfest-marzen\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/beverage.schema.json",
        "required_fields": {
          "id": "ogs:core:beverage:oktoberfest-marzen",
          "ogs_version": "0.1.0",
          "type": "beer"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:oktoberfest-marzen\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Oktoberfest Märzen lager\",\n  \"type\": \"beer\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\n    \"en\"\n  ],\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 4.8,\n      \"sour\": 0.8,\n      \"salty\": 0.2,\n      \"bitter\": 4.2,\n      \"umami\": 1.0\n    },\n    \"structural\": {\n      \"body\": 6.5,\n      \"acidity_perception\": 1.1,\n      \"tannin\": 0.0,\n      \"astringency\": 0.2,\n      \"alcohol_heat\": 2.5,\n      \"effervescence\": 3.8\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"confection\",\n        \"group\": \"malt sweetness\",\n        \"item\": \"toasted bread\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"oak/wood impression\",\n        \"item\": \"light wood\",\n        \"intensity\": 1.0\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"bready spice\",\n        \"item\": \"noble-spice\",\n        \"intensity\": 1.8\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"dried fruit\",\n        \"item\": \"raisin\",\n        \"intensity\": 1.6\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"hop earthiness\",\n        \"item\": \"earthy hop\",\n        \"intensity\": 2.2\n      }\n    ],\n    \"texture\": [\n      \"medium body\",\n      \"creamy foam\"\n    ],\n    \"finish_length\": 6.0,\n    \"complexity\": 6.2\n  },\n  \"notes\": \"A classic Oktoberfest-style Märzen lager profile with pronounced toasted/bready malt character, moderate bitterness, and a clean, lager-driven finish.\",\n  \"metadata\": {\n    \"source\": \"general sensory description\",\n    \"license\": \"unspecified\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:oktoberfest-marzen",
                "expected": "ogs:core:beverage:oktoberfest-marzen",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "beer",
                "expected": "beer",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2515,
            "output_tokens": 514,
            "latency_ms": 3414.8,
            "usd": 0.000331,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:oktoberfest-marzen\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Oktoberfest Märzen lager\",\n  \"type\": \"beer\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 5.5,\n      \"sour\": 1.0,\n      \"salty\": 0.6,\n      \"bitter\": 4.5,\n      \"umami\": 1.2\n    },\n    \"structural\": {\n      \"body\": 5.8,\n      \"acidity_perception\": 1.0,\n      \"tannin\": 0.2,\n      \"astringency\": 1.0,\n      \"alcohol_heat\": 2.6,\n      \"effervescence\": 2.8\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"confection\",\n        \"group\": \"malty\",\n        \"item\": \"toasted bread\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"malty\",\n        \"item\": \"biscuit\",\n        \"intensity\": 5.2\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"malt-derived\",\n        \"item\": \"light toast\",\n        \"intensity\": 2.8\n      },\n      {\n        \"family\": \"vegetal\",\n        \"group\": \"grain\",\n        \"item\": \"hay\",\n        \"intensity\": 1.6\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"hop\",\n        \"item\": \"noble hop herbal\",\n        \"intensity\": 2.4\n      }\n    ],\n    \"texture\": [\n      \"clean\",\n      \"moderate creaminess\"\n    ],\n    \"finish_length\": 6.0,\n    \"complexity\": 5.2\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:oktoberfest-marzen",
                "expected": "ogs:core:beverage:oktoberfest-marzen",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "beer",
                "expected": "beer",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2515,
            "output_tokens": 431,
            "latency_ms": 2949.6,
            "usd": 0.000298,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:oktoberfest-marzen\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Oktoberfest Märzen lager\",\n  \"type\": \"beer\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 6.5,\n      \"sour\": 1.5,\n      \"salty\": 1.0,\n      \"bitter\": 3.5,\n      \"umami\": 1.0\n    },\n    \"structural\": {\n      \"body\": 6.0,\n      \"acidity_perception\": 1.5,\n      \"tannin\": 0.2,\n      \"astringency\": 0.6,\n      \"alcohol_heat\": 2.5,\n      \"effervescence\": 3.5\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"confection\",\n        \"group\": \"malty\",\n        \"item\": \"toasted bread\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"malty\",\n        \"item\": \"caramel\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"malty\",\n        \"item\": \"biscuit\",\n        \"intensity\": 3.5\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"noble-hop character\",\n        \"item\": \"herbal\",\n        \"intensity\": 2.5\n      }\n    ],\n    \"texture\": [\n      \"medium-bodied\",\n      \"clean\",\n      \"slightly creamy foam\"\n    ],\n    \"finish_length\": 5.0,\n    \"complexity\": 5.5\n  },\n  \"primary_language\": \"en\",\n  \"available_languages\": [\n    \"en\"\n  ],\n  \"description\": \"A traditional Märzen-style Oktoberfest lager with pronounced toasted-malty character, moderate sweetness, and a balanced, gently bitter finish.\",\n  \"serve_temperature_c\": {\n    \"min\": 7,\n    \"max\": 10\n  },\n  \"metadata\": {\n    \"source\": \"ogstingestion\",\n    \"license\": \"unspecified\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:oktoberfest-marzen",
                "expected": "ogs:core:beverage:oktoberfest-marzen",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "beer",
                "expected": "beer",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 2515,
            "output_tokens": 498,
            "latency_ms": 3178.3,
            "usd": 0.000325,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 2515.0,
        "output_tokens_mean": 481.0,
        "latency_ms_mean": 3180.9,
        "usd_sum": 0.000954
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-015",
      "category": "structured_output",
      "capability": "schema_valid_pairing",
      "difficulty": "hard",
      "cuisine_region": "french",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.pairing",
        "vocab.match_types",
        "vocab.explanation_codes",
        "vocab.grade_scale",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "f0c01fa20f35eb51",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS pairing schema (`schema/pairing.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/pairing.schema.json\",\n  \"title\": \"OGS Pairing\",\n  \"description\": \"A scored, explained pairing between a dish and a beverage.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"dish_ref\",\n    \"beverage_ref\",\n    \"match_types\",\n    \"overall_score\",\n    \"grade\",\n    \"explanation_codes\",\n    \"confidence\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:pairing:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"dish_ref\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:dish:[a-z0-9-]+$\"\n    },\n    \"beverage_ref\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:beverage:[a-z0-9-]+$\"\n    },\n    \"match_types\": {\n      \"type\": \"array\",\n      \"minItems\": 1,\n      \"items\": {\n        \"$ref\": \"#/$defs/match_type_entry\"\n      }\n    },\n    \"overall_score\": {\n      \"type\": \"integer\",\n      \"minimum\": 0,\n      \"maximum\": 100\n    },\n    \"grade\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"A+\",\n        \"A\",\n        \"B+\",\n        \"B\",\n        \"C+\",\n        \"C\",\n        \"D\",\n        \"F\"\n      ]\n    },\n    \"explanation_codes\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/explanation_code_entry\"\n      }\n    },\n    \"risks\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/risk_entry\"\n      }\n    },\n    \"confidence\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 1\n    },\n    \"evidence\": {\n      \"$ref\": \"#/$defs/evidence\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"match_type_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"type\",\n        \"score\"\n      ],\n      \"properties\": {\n        \"type\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"harmony\",\n            \"contrast\",\n            \"bridge\",\n            \"intensity_alignment\",\n            \"structural_balance\",\n            \"refresh\"\n          ]\n        },\n        \"score\": {\n          \"type\": \"integer\",\n          \"minimum\": 0,\n          \"maximum\": 100\n        },\n        \"description\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"primary\": {\n          \"type\": \"boolean\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"explanation_code_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"code\"\n      ],\n      \"properties\": {\n        \"code\": {\n          \"type\": \"string\"\n        },\n        \"weight\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 1\n        },\n        \"detail\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"risk_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"code\",\n        \"severity\"\n      ],\n      \"properties\": {\n        \"code\": {\n          \"type\": \"string\"\n        },\n        \"severity\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"low\",\n            \"medium\",\n            \"high\"\n          ]\n        },\n        \"description\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"mitigation\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"evidence\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"method\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"expert_panel\",\n            \"instrumental\",\n            \"computed\",\n            \"estimated\"\n          ]\n        },\n        \"sources\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n### OGS Explanation Codes\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n### Grade scale (derived from 0-100 overall_score)\n\n- **A+**: 95-100\n- **A**: 90-94\n- **B+**: 85-89\n- **B**: 80-84\n- **C+**: 75-79\n- **C**: 70-74\n- **D**: 60-69\n- **F**: <60\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce an OGS pairing JSON document for the described match. It MUST validate against the OGS pairing schema. Respond with ONLY the JSON.\n\nINPUT:\n{\n  \"pairing_slug\": \"crottin-sancerre\",\n  \"dish_ref\": \"ogs:core:dish:crottin-de-chavignol\",\n  \"beverage_ref\": \"ogs:core:beverage:sancerre-blanc\",\n  \"intent\": \"Classic Loire regional pairing: Sancerre's acidity and citrus meet fresh goat cheese's tang.\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/pairing.schema.json",
        "required_fields": {
          "id": "ogs:core:pairing:crottin-sancerre",
          "ogs_version": "0.1.0",
          "dish_ref": "ogs:core:dish:crottin-de-chavignol",
          "beverage_ref": "ogs:core:beverage:sancerre-blanc"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:benchmark-task:pairing:crottin-sancerre\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": {\n    \"en\": \"Sancerre Blanc with Crottin de Chavignol\"\n  },\n  \"description\": {\n    \"en\": \"A classic Loire-style pairing: Sancerre’s high acidity and citrus notes cut through the tangy, lactic character of fresh goat cheese, balancing richness with freshness.\"\n  },\n  \"dish_ref\": \"ogs:core:dish:crottin-de-chavignol\",\n  \"beverage_ref\": \"ogs:core:beverage:sancerre-blanc\",\n  \"match_types\": [\n    {\n      \"type\": \"harmony\",\n      \"score\": 82,\n      \"description\": {\n        \"en\": \"Shared Loire sensibility and complementary overall profiles, with the wine’s bright citrus harmonizing with the cheese’s fresh tang.\"\n      },\n      \"primary\": false\n    },\n    {\n      \"type\": \"refresh\",\n      \"score\": 90,\n      \"description\": {\n        \"en\": \"Sancerre’s acidity refreshes the palate and keeps the cheese’s tang lively rather than heavy.\"\n      },\n      \"primary\": true\n    },\n    {\n      \"type\": \"structural_balance\",\n      \"score\": 86,\n      \"description\": {\n        \"en\": \"Body and perceived intensity are well matched: the wine’s lean structure supports the cheese without being overwhelmed.\"\n      },\n      \"primary\": false\n    },\n    {\n      \"type\": \"contrast\",\n      \"score\": 78,\n      \"description\": {\n        \"en\": \"A dynamic tension forms between the wine’s bright acidity and the cheese’s tangy, creamy salinity.\"\n      },\n      \"primary\": false\n    }\n  ],\n  \"overall_score\": 88,\n  \"grade\": \"A\",\n  \"explanation_codes\": [\n    {\n      \"code\": \"HARMONY_REGIONAL\",\n      \"weight\": 0.25,\n      \"detail\": {\n        \"en\": \"Both are strongly associated with the Loire region tradition.\"\n      }\n    },\n    {\n      \"code\": \"STRUCT_ACIDITY_BALANCE\",\n      \"weight\": 0.35,\n      \"detail\": {\n        \"en\": \"The wine’s acidity cuts through the cheese’s creamy tang and helps prevent it from feeling heavy.\"\n      }\n    },\n    {\n      \"code\": \"REFRESH_ACID\",\n      \"weight\": 0.25,\n      \"detail\": {\n        \"en\": \"Acid refresh keeps the palate clean between bites.\"\n      }\n    },\n    {\n      \"code\": \"CONTRAST_SWEET_ACID\",\n      \"weight\": 0.15,\n      \"detail\": {\n        \"en\": \"The cheese’s perceived salt/cream character is enlivened by the wine’s bright acidity.\"\n      }\n    }\n  ],\n  \"risks\": [],\n  \"confidence\": 0.78,\n  \"evidence\": {\n    \"method\": \"expert_panel\",\n    \"sources\": [\n      \"Classic Loire white wine and goat-cheese pairing practice (regional culinary consensus)\"\n    ],\n    \"notes\": {\n      \"en\": \"Fresh goat cheese benefits from high-acid, citrus-driven whites; Sancerre Blanc is a common canonical match.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Best when the crottin is served fresh and not overly aged, and the Sancerre is vibrant rather than over-oaked.\"\n  },\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"updated\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"user-provided intent; pairing rationale aligned to OGS match/explanation codes\"\n  }\n}",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": "ogs:benchmark-task:pairing:crottin-sancerre",
                "expected": "ogs:core:pairing:crottin-sancerre",
                "match": false
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "dish_ref": {
                "predicted": "ogs:core:dish:crottin-de-chavignol",
                "expected": "ogs:core:dish:crottin-de-chavignol",
                "match": true
              },
              "beverage_ref": {
                "predicted": "ogs:core:beverage:sancerre-blanc",
                "expected": "ogs:core:beverage:sancerre-blanc",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3381,
            "output_tokens": 821,
            "latency_ms": 5090.2,
            "usd": 0.000497,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:pairing:crottin-sancerre\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": {\n    \"en\": \"Crottin de Chavignol with Sancerre Blanc\"\n  },\n  \"description\": {\n    \"en\": \"A classic Loire-style pairing where Sancerre’s bright acidity and citrus notes meet fresh goat cheese tang, creating a clean, lively balance.\"\n  },\n  \"dish_ref\": \"ogs:core:dish:crottin-de-chavignol\",\n  \"beverage_ref\": \"ogs:core:beverage:sancerre-blanc\",\n  \"match_types\": [\n    {\n      \"type\": \"harmony\",\n      \"score\": 84,\n      \"description\": {\n        \"en\": \"Shared Loire freshness: the dish’s tang aligns with the wine’s bright, citrus-driven profile.\"\n      },\n      \"primary\": true\n    },\n    {\n      \"type\": \"structural_balance\",\n      \"score\": 88,\n      \"description\": {\n        \"en\": \"Sancerre acidity cuts through the cheese’s richness while echoing its tang for a balanced mouthfeel.\"\n      },\n      \"primary\": true\n    },\n    {\n      \"type\": \"refresh\",\n      \"score\": 90,\n      \"description\": {\n        \"en\": \"High-acid character refreshes and resets the palate between bites of fresh goat cheese.\"\n      },\n      \"primary\": true\n    },\n    {\n      \"type\": \"intensity_alignment\",\n      \"score\": 78,\n      \"description\": {\n        \"en\": \"Generally well-matched intensity: the wine’s acidity stays prominent without overwhelming the cheese.\"\n      },\n      \"primary\": false\n    }\n  ],\n  \"overall_score\": 88,\n  \"grade\": \"A\",\n  \"explanation_codes\": [\n    {\n      \"code\": \"STRUCT_ACIDITY_BALANCE\",\n      \"weight\": 0.9,\n      \"detail\": {\n        \"en\": \"The wine’s acidity balances and lifts the cheese’s tang and creamy richness.\"\n      }\n    },\n    {\n      \"code\": \"REFRESH_ACID\",\n      \"weight\": 0.85,\n      \"detail\": {\n        \"en\": \"Acid refresh helps keep the pairing lively across bites.\"\n      }\n    },\n    {\n      \"code\": \"HARMONY_REGIONAL\",\n      \"weight\": 0.6,\n      \"detail\": {\n        \"en\": \"Both elements reflect classic Loire pairing logic (terroir-driven freshness).\"\n      }\n    }\n  ],\n  \"risks\": [],\n  \"confidence\": 0.78,\n  \"evidence\": {\n    \"method\": \"expert_panel\",\n    \"sources\": [\n      \"Classic regional pairing conventions for Loire Sauvignon Blanc with fresh goat cheese (crottin de Chavignol).\"\n    ],\n    \"notes\": {\n      \"en\": \"Fresh goat cheese tang typically benefits from high-acid Sauvignon Blanc characteristics.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Serve Sancerre chilled (not too cold) to preserve citrus aromatics and acidity; aim for a fresh, youthful style to match the cheese’s tang.\"\n  },\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"updated\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"task-generated\",\n    \"license\": \"CC0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:pairing:crottin-sancerre",
                "expected": "ogs:core:pairing:crottin-sancerre",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "dish_ref": {
                "predicted": "ogs:core:dish:crottin-de-chavignol",
                "expected": "ogs:core:dish:crottin-de-chavignol",
                "match": true
              },
              "beverage_ref": {
                "predicted": "ogs:core:beverage:sancerre-blanc",
                "expected": "ogs:core:beverage:sancerre-blanc",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3381,
            "output_tokens": 740,
            "latency_ms": 4531.5,
            "usd": 0.000465,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:pairing:crottin-sancerre\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": {\n    \"en\": \"Sancerre Blanc × Crottin de Chavignol\"\n  },\n  \"description\": {\n    \"en\": \"Classic Loire pairing: Sancerre’s high acidity and citrusy aromatics cut through the tangy, fresh goat cheese (crottin), creating a clean, refreshing finish.\"\n  },\n  \"dish_ref\": \"ogs:core:dish:crottin-de-chavignol\",\n  \"beverage_ref\": \"ogs:core:beverage:sancerre-blanc\",\n  \"match_types\": [\n    {\n      \"type\": \"harmony\",\n      \"score\": 82,\n      \"description\": {\n        \"en\": \"Shared regional style and flavor direction: bright, tangy, and herb-lifted notes that feel cohesive with fresh goat cheese.\"\n      },\n      \"primary\": true\n    },\n    {\n      \"type\": \"refresh\",\n      \"score\": 92,\n      \"description\": {\n        \"en\": \"Sancerre acidity refreshes the palate and keeps the cheese’s tang from feeling heavy.\"\n      },\n      \"primary\": true\n    },\n    {\n      \"type\": \"contrast\",\n      \"score\": 70,\n      \"description\": {\n        \"en\": \"Citrus brightness in the wine provides lively tension against the cheese’s creamy, slightly savory tang.\"\n      },\n      \"primary\": false\n    },\n    {\n      \"type\": \"structural_balance\",\n      \"score\": 85,\n      \"description\": {\n        \"en\": \"Good intensity/body alignment with acidity balance to support the cheese’s fattiness without muting it.\"\n      },\n      \"primary\": false\n    }\n  ],\n  \"overall_score\": 89,\n  \"grade\": \"A\",\n  \"explanation_codes\": [\n    {\n      \"code\": \"STRUCT_ACIDITY_BALANCE\",\n      \"weight\": 0.9,\n      \"detail\": {\n        \"en\": \"Wine acidity cuts through the cheese’s richness and keeps the tasting sensation fresh.\"\n      }\n    },\n    {\n      \"code\": \"REFRESH_ACID\",\n      \"weight\": 0.85,\n      \"detail\": {\n        \"en\": \"High acidity provides a palate-reset between bites.\"\n      }\n    },\n    {\n      \"code\": \"HARMONY_REGIONAL\",\n      \"weight\": 0.45,\n      \"detail\": {\n        \"en\": \"Loire regional affinity aligns wine style with the local goat-cheese profile.\"\n      }\n    }\n  ],\n  \"confidence\": 0.78,\n  \"risks\": [],\n  \"evidence\": {\n    \"method\": \"estimated\",\n    \"sources\": [\n      \"OGS intent: classic Loire pairing with acidity/citrus vs fresh goat cheese tang\"\n    ],\n    \"notes\": {\n      \"en\": \"Fresh crottin typically pairs best with high-acid Sauvignon Blanc styles like Sancerre.\"\n    }\n  },\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"updated\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"user-provided intent\",\n    \"license\": \"unspecified\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:pairing:crottin-sancerre",
                "expected": "ogs:core:pairing:crottin-sancerre",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "dish_ref": {
                "predicted": "ogs:core:dish:crottin-de-chavignol",
                "expected": "ogs:core:dish:crottin-de-chavignol",
                "match": true
              },
              "beverage_ref": {
                "predicted": "ogs:core:beverage:sancerre-blanc",
                "expected": "ogs:core:beverage:sancerre-blanc",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3381,
            "output_tokens": 710,
            "latency_ms": 4431.3,
            "usd": 0.000453,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.8333,
      "score_summary": {
        "mean": 0.8333,
        "stdev": 0.2887,
        "sem": 0.1667,
        "ci95_lo": 0.5067,
        "ci95_hi": 1.16,
        "n": 3
      },
      "score_if_valid": 0.8333,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 3381.0,
        "output_tokens_mean": 757.0,
        "latency_ms_mean": 4684.333,
        "usd_sum": 0.001415
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-016",
      "category": "structured_output",
      "capability": "schema_valid_dish",
      "difficulty": "hard",
      "cuisine_region": "japanese",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.dish",
        "vocab.component_roles",
        "vocab.cooking_methods",
        "vocab.courses",
        "vocab.service_temperatures",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "284b25ae8f0dc3ef",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS dish schema (`schema/dish.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/dish.schema.json\",\n  \"title\": \"OGS Dish\",\n  \"description\": \"A composed dish with components, cooking methods, and overall sensory profile.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"components\",\n    \"overall_sensory_profile\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:dish:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"cuisine\": {\n      \"type\": \"string\",\n      \"description\": \"Free-text cuisine name (v0.1) OR a cuisine entity reference matching ogs:<ns>:cuisine:<id> (v0.2+).\"\n    },\n    \"course\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"amuse\",\n        \"appetizer\",\n        \"soup\",\n        \"salad\",\n        \"fish\",\n        \"main\",\n        \"cheese\",\n        \"dessert\",\n        \"petit_four\"\n      ]\n    },\n    \"service_temperature\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"frozen\",\n        \"cold\",\n        \"cool\",\n        \"room\",\n        \"warm\",\n        \"hot\",\n        \"very_hot\"\n      ]\n    },\n    \"components\": {\n      \"type\": \"array\",\n      \"minItems\": 1,\n      \"items\": {\n        \"$ref\": \"#/$defs/component\"\n      }\n    },\n    \"overall_sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"component\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"role\",\n        \"ingredient_ref\"\n      ],\n      \"properties\": {\n        \"role\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"main_protein\",\n            \"main_vegetable\",\n            \"starch\",\n            \"sauce\",\n            \"fat_component\",\n            \"acid_component\",\n            \"aromatic_garnish\",\n            \"seasoning\",\n            \"textural_element\"\n          ]\n        },\n        \"ingredient_ref\": {\n          \"type\": \"string\",\n          \"pattern\": \"^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$\"\n        },\n        \"cooking_method\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"raw\",\n            \"seared\",\n            \"grilled\",\n            \"roasted\",\n            \"braised\",\n            \"poached\",\n            \"steamed\",\n            \"fried\",\n            \"confit\",\n            \"smoked\",\n            \"fermented\",\n            \"cured\",\n            \"pickled\",\n            \"dehydrated\",\n            \"sous_vide\",\n            \"blanched\",\n            \"sauteed\",\n            \"baked\",\n            \"charred\",\n            \"emulsified\"\n          ]\n        },\n        \"maillard_index\": {\n          \"type\": \"integer\",\n          \"minimum\": 0,\n          \"maximum\": 5\n        },\n        \"reduction_level\": {\n          \"type\": \"integer\",\n          \"minimum\": 0,\n          \"maximum\": 5\n        },\n        \"proportion\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"dominant\",\n            \"significant\",\n            \"accent\",\n            \"trace\"\n          ]\n        },\n        \"sensory_contribution\": {\n          \"type\": \"object\",\n          \"properties\": {\n            \"basic_tastes\": {\n              \"$ref\": \"#/$defs/basic_tastes\"\n            },\n            \"aromatics\": {\n              \"type\": \"array\",\n              \"items\": {\n                \"$ref\": \"#/$defs/aromatic_entry\"\n              }\n            },\n            \"texture\": {\n              \"type\": \"array\",\n              \"items\": {\n                \"type\": \"string\"\n              }\n            }\n          },\n          \"additionalProperties\": true\n        },\n        \"notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Service temperatures\n\nfrozen, cold, cool, room, warm, hot, very_hot\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce an OGS dish JSON for tonkotsu ramen (pork-bone broth, wheat noodles, chashu, soft egg, nori, scallion). Respond with ONLY the JSON.\n\nINPUT:\n{\n  \"dish_name\": \"tonkotsu ramen\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"tonkotsu-ramen\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/dish.schema.json",
        "required_fields": {
          "id": "ogs:core:dish:tonkotsu-ramen",
          "ogs_version": "0.1.0",
          "course": "main"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:dish:tonkotsu-ramen\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"tonkotsu ramen\",\n  \"course\": \"main\",\n  \"service_temperature\": \"hot\",\n  \"components\": [\n    {\n      \"role\": \"main_protein\",\n      \"ingredient_ref\": \"ogs:core:ingredient:pork-chashu\",\n      \"cooking_method\": \"braised\",\n      \"maillard_index\": 2,\n      \"reduction_level\": 1,\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.5,\n          \"sour\": 0.0,\n          \"salty\": 2.5,\n          \"bitter\": 0.2,\n          \"umami\": 6.5\n        },\n        \"texture\": [\n          \"tender\",\n          \"silky\",\n          \"silky-fatty\"\n        ]\n      }\n    },\n    {\n      \"role\": \"starch\",\n      \"ingredient_ref\": \"ogs:core:ingredient:wheat-ramen-noodles\",\n      \"cooking_method\": \"steamed\",\n      \"proportion\": \"dominant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.8,\n          \"sour\": 0.0,\n          \"salty\": 1.2,\n          \"bitter\": 0.0,\n          \"umami\": 1.0\n        },\n        \"texture\": [\n          \"springy\",\n          \"chewy\"\n        ]\n      }\n    },\n    {\n      \"role\": \"sauce\",\n      \"ingredient_ref\": \"ogs:core:ingredient:tonkotsu-pork-bone-broth\",\n      \"cooking_method\": \"braised\",\n      \"reduction_level\": 4,\n      \"proportion\": \"dominant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 1.2,\n          \"sour\": 0.2,\n          \"salty\": 6.0,\n          \"bitter\": 0.3,\n          \"umami\": 8.0\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"savory\",\n            \"group\": \"broth depth\",\n            \"item\": \"pork-bone richness\",\n            \"intensity\": 7.5\n          },\n          {\n            \"family\": \"wood\",\n            \"group\": \"roast notes\",\n            \"item\": \"toasty pork fat\",\n            \"intensity\": 3.5\n          }\n        ],\n        \"texture\": [\n          \"creamy\",\n          \"silky\"\n        ]\n      }\n    },\n    {\n      \"role\": \"textural_element\",\n      \"ingredient_ref\": \"ogs:core:ingredient:soft-boiled-egg\",\n      \"cooking_method\": \"poached\",\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.3,\n          \"sour\": 0.0,\n          \"salty\": 2.2,\n          \"bitter\": 0.0,\n          \"umami\": 3.0\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"dairy\",\n            \"group\": \"richness\",\n            \"item\": \"silky yolk\",\n            \"intensity\": 2.5\n          }\n        ],\n        \"texture\": [\n          \"creamy center\",\n          \"tender\"\n        ]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:nori-seaweed\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.2,\n          \"sour\": 0.0,\n          \"salty\": 1.8,\n          \"bitter\": 0.5,\n          \"umami\": 3.8\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"marine\",\n            \"group\": \"sea aroma\",\n            \"item\": \"nori\",\n            \"intensity\": 3.5\n          }\n        ],\n        \"texture\": [\n          \"slightly crisp\",\n          \"chewy\"\n        ]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:scallion\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.4,\n          \"sour\": 0.1,\n          \"salty\": 0.8,\n          \"bitter\": 0.3,\n          \"umami\": 1.0\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"herbal\",\n            \"group\": \"green notes\",\n            \"item\": \"scallion\",\n            \"intensity\": 3.0,\n            \"qualifier\": \"fresh\"\n          }\n        ],\n        \"texture\": [\n          \"fresh crisp\"\n        ]\n      }\n    }\n  ],\n  \"overall_sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 1.1,\n      \"sour\": 0.2,\n      \"salty\": 6.4,\n      \"bitter\": 0.4,\n      \"umami\": 8.3\n    },\n    \"structural\": {\n      \"body\": 8.8,\n      \"acidity_perception\": 0.8,\n      \"tannin\": 0.2,\n      \"astringency\": 0.3,\n      \"alcohol_heat\": 0.0,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"savory\",\n        \"group\": \"broth depth\",\n        \"item\": \"pork-bone umami\",\n        \"intensity\": 8.0\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"roast fat\",\n        \"item\": \"toasty notes\",\n        \"intensity\": 3.6\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"green freshness\",\n        \"item\": \"scallion\",\n        \"intensity\": 2.8,\n        \"qualifier\": \"fresh\"\n      },\n      {\n        \"family\": \"marine\",\n        \"group\": \"sea aroma\",\n        \"item\": \"nori\",\n        \"intensity\": 3.4\n      }\n    ],\n    \"texture\": [\n      \"creamy broth\",\n      \"chewy noodles\",\n      \"tender chashu\",\n      \"silky yolk\",\n      \"slightly crisp seaweed\"\n    ],\n    \"finish_length\": 7.5,\n    \"complexity\": 7.2\n  },\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"user-input\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:dish:tonkotsu-ramen",
                "expected": "ogs:core:dish:tonkotsu-ramen",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "course": {
                "predicted": "main",
                "expected": "main",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3077,
            "output_tokens": 1565,
            "latency_ms": 8848.2,
            "usd": 0.00078,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:dish:tonkotsu-ramen\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": {\n    \"en\": \"tonkotsu ramen\"\n  },\n  \"course\": \"main\",\n  \"service_temperature\": \"hot\",\n  \"components\": [\n    {\n      \"role\": \"main_protein\",\n      \"ingredient_ref\": \"ogs:core:ingredient:pork-bone-broth\",\n      \"cooking_method\": \"braised\",\n      \"proportion\": \"dominant\",\n      \"maillard_index\": 1,\n      \"reduction_level\": 5,\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.8,\n          \"sour\": 0.2,\n          \"salty\": 7.8,\n          \"bitter\": 0.2,\n          \"umami\": 9.1\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"savory\",\n            \"group\": \"broth notes\",\n            \"item\": \"pork richness\",\n            \"intensity\": 8.8\n          }\n        ],\n        \"texture\": [\n          \"silky\",\n          \"viscous\"\n        ]\n      },\n      \"notes\": {\n        \"en\": \"Creamy, gelatinous pork-bone stock typical of tonkotsu style.\"\n      }\n    },\n    {\n      \"role\": \"starch\",\n      \"ingredient_ref\": \"ogs:core:ingredient:wheat-noodles\",\n      \"cooking_method\": \"blanched\",\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.3,\n          \"sour\": 0,\n          \"salty\": 1.2,\n          \"bitter\": 0.1,\n          \"umami\": 0.2\n        },\n        \"texture\": [\n          \"springy\",\n          \"chewy\"\n        ]\n      },\n      \"notes\": {\n        \"en\": \"Wheat ramen noodles with a springy chew that absorbs broth.\"\n      }\n    },\n    {\n      \"role\": \"textural_element\",\n      \"ingredient_ref\": \"ogs:core:ingredient-chashu-pork\",\n      \"cooking_method\": \"braised\",\n      \"proportion\": \"accent\",\n      \"maillard_index\": 2,\n      \"reduction_level\": 3,\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 1.6,\n          \"sour\": 0.2,\n          \"salty\": 4.8,\n          \"bitter\": 0.3,\n          \"umami\": 6.3\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"savory\",\n            \"group\": \"braise notes\",\n            \"item\": \"roasted pork depth\",\n            \"intensity\": 5.6\n          }\n        ],\n        \"texture\": [\n          \"tender\",\n          \"melt-in-mouth\"\n        ]\n      },\n      \"notes\": {\n        \"en\": \"Sliced braised pork belly/loin for tender richness.\"\n      }\n    },\n    {\n      \"role\": \"textural_element\",\n      \"ingredient_ref\": \"ogs:core:ingredient-soft-boiled-egg\",\n      \"cooking_method\": \"poached\",\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.4,\n          \"sour\": 0,\n          \"salty\": 3.4,\n          \"bitter\": 0.1,\n          \"umami\": 3.1\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"dairy\",\n            \"group\": \"egg richness\",\n            \"item\": \"creamy yolk\",\n            \"intensity\": 4.2\n          }\n        ],\n        \"texture\": [\n          \"jammy yolk\",\n          \"silky\"\n        ]\n      },\n      \"notes\": {\n        \"en\": \"Soft-boiled egg with a jammy yolk (often seasoned/marinaded).\"\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient-nori-seaweed\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"trace\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.2,\n          \"sour\": 0,\n          \"salty\": 2.8,\n          \"bitter\": 0.2,\n          \"umami\": 4.9\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"marine\",\n            \"group\": \"seaweed\",\n            \"item\": \"nori umami\",\n            \"intensity\": 5.1\n          }\n        ],\n        \"texture\": [\n          \"crisp\",\n          \"lean\"\n        ]\n      },\n      \"notes\": {\n        \"en\": \"Toasted seaweed for salty oceanic umami and snap.\"\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient-scallion\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.4,\n          \"sour\": 0.1,\n          \"salty\": 0.8,\n          \"bitter\": 0.3,\n          \"umami\": 0.6\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"herbal\",\n            \"group\": \"green notes\",\n            \"item\": \"fresh scallion\",\n            \"intensity\": 4.6\n          }\n        ],\n        \"texture\": [\n          \"fresh crunch\"\n        ]\n      },\n      \"notes\": {\n        \"en\": \"Fresh green onion adds brightness and aromatic lift.\"\n      }\n    }\n  ],\n  \"overall_sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 1.2,\n      \"sour\": 0.2,\n      \"salty\": 7.6,\n      \"bitter\": 0.3,\n      \"umami\": 9.3\n    },\n    \"structural\": {\n      \"body\": 9.2,\n      \"acidity_perception\": 0.8,\n      \"tannin\": 0.3,\n      \"astringency\": 0.4,\n      \"alcohol_heat\": 0.0,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"savory\",\n        \"group\": \"broth\",\n        \"item\": \"pork-bone richness\",\n        \"intensity\": 8.9\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"garnish\",\n        \"item\": \"scallion freshness\",\n        \"intensity\": 4.6\n      },\n      {\n        \"family\": \"marine\",\n        \"group\": \"seaweed\",\n        \"item\": \"nori umami\",\n        \"intensity\": 5.0\n      }\n    ],\n    \"texture\": [\n      \"silky\",\n      \"chewy noodles\",\n      \"tender chashu\",\n      \"jammy egg\",\n      \"light seaweed snap\"\n    ],\n    \"finish_length\": 6.6,\n    \"complexity\": 7.8\n  },\n  \"metadata\": {\n    \"source\": \"user-provided\",\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"updated\": \"2026-04-18T00:00:00Z\"\n  }\n}",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "'ogs:core:ingredient-scallion' does not match '^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$'"
            ],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:dish:tonkotsu-ramen",
                "expected": "ogs:core:dish:tonkotsu-ramen",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "course": {
                "predicted": "main",
                "expected": "main",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3077,
            "output_tokens": 1697,
            "latency_ms": 9668.6,
            "usd": 0.000833,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:dish:tonkotsu-ramen\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": \"tonkotsu ramen\",\n  \"description\": \"Japanese tonkotsu ramen with pork-bone broth, wheat noodles, chashu pork, a soft-boiled egg, nori, and scallion.\",\n  \"course\": \"main\",\n  \"service_temperature\": \"hot\",\n  \"components\": [\n    {\n      \"role\": \"main_protein\",\n      \"ingredient_ref\": \"ogs:core:ingredient:pork-chashu\",\n      \"cooking_method\": \"braised\",\n      \"maillard_index\": 2,\n      \"reduction_level\": 3,\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.4,\n          \"sour\": 0,\n          \"salty\": 4.5,\n          \"bitter\": 0.3,\n          \"umami\": 7.5\n        },\n        \"texture\": [\n          \"tender\",\n          \"silky-fat melt\"\n        ],\n        \"aromatics\": [\n          {\n            \"family\": \"savory\",\n            \"group\": \"broth-spice\",\n            \"item\": \"pork-rich umami\",\n            \"intensity\": 7.0\n          }\n        ]\n      }\n    },\n    {\n      \"role\": \"starch\",\n      \"ingredient_ref\": \"ogs:core:ingredient:wheat-ramen-noodles\",\n      \"cooking_method\": \"boiled\",\n      \"maillard_index\": 0,\n      \"reduction_level\": 0,\n      \"proportion\": \"dominant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.6,\n          \"sour\": 0,\n          \"salty\": 1.5,\n          \"bitter\": 0,\n          \"umami\": 1.0\n        },\n        \"texture\": [\n          \"springy\",\n          \"slightly chewy\"\n        ],\n        \"aromatics\": [\n          {\n            \"family\": \"confection\",\n            \"group\": \"grain\",\n            \"item\": \"wheat aroma\",\n            \"intensity\": 1.5\n          }\n        ]\n      }\n    },\n    {\n      \"role\": \"sauce\",\n      \"ingredient_ref\": \"ogs:core:ingredient/tonkotsu-pork-bone-broth\",\n      \"cooking_method\": \"braised\",\n      \"maillard_index\": 1,\n      \"reduction_level\": 5,\n      \"proportion\": \"dominant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.9,\n          \"sour\": 0.4,\n          \"salty\": 5.2,\n          \"bitter\": 0.2,\n          \"umami\": 8.6\n        },\n        \"texture\": [\n          \"creamy\",\n          \"coating\"\n        ],\n        \"aromatics\": [\n          {\n            \"family\": \"savory\",\n            \"group\": \"bone-broth\",\n            \"item\": \"pork-bone richness\",\n            \"intensity\": 8.5\n          },\n          {\n            \"family\": \"wood\",\n            \"group\": \"roasted-note\",\n            \"item\": \"slow-cooked depth\",\n            \"intensity\": 3.8\n          }\n        ]\n      }\n    },\n    {\n      \"role\": \"fat_component\",\n      \"ingredient_ref\": \"ogs:core:ingredient/tonkotsu-broth-coating-fat\",\n      \"cooking_method\": \"confit\",\n      \"maillard_index\": 0,\n      \"reduction_level\": 0,\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0,\n          \"sour\": 0,\n          \"salty\": 0.6,\n          \"bitter\": 0,\n          \"umami\": 1.6\n        },\n        \"texture\": [\n          \"silky\",\n          \"rich mouthfeel\"\n        ],\n        \"aromatics\": [\n          {\n            \"family\": \"dairy\",\n            \"group\": \"creamy\",\n            \"item\": \"fat-driven richness\",\n            \"intensity\": 4.2\n          }\n        ]\n      }\n    },\n    {\n      \"role\": \"textural_element\",\n      \"ingredient_ref\": \"ogs:core:ingredient/soft-boiled-egg\",\n      \"cooking_method\": \"poached\",\n      \"maillard_index\": 0,\n      \"reduction_level\": 0,\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.2,\n          \"sour\": 0,\n          \"salty\": 2.3,\n          \"bitter\": 0,\n          \"umami\": 2.6\n        },\n        \"texture\": [\n          \"creamy yolk\",\n          \"tender white\"\n        ],\n        \"aromatics\": [\n          {\n            \"family\": \"dairy\",\n            \"group\": \"egg\",\n            \"item\": \"rich yolk aroma\",\n            \"intensity\": 3.0\n          }\n        ]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient/nori-dried-seaweed\",\n      \"cooking_method\": \"raw\",\n      \"maillard_index\": 0,\n      \"reduction_level\": 0,\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.1,\n          \"sour\": 0,\n          \"salty\": 2.6,\n          \"bitter\": 0.2,\n          \"umami\": 3.9\n        },\n        \"texture\": [\n          \"slight chew\"\n        ],\n        \"aromatics\": [\n          {\n            \"family\": \"marine\",\n            \"group\": \"seaweed\",\n            \"item\": \"nori umami\",\n            \"intensity\": 3.8\n          }\n        ]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient/scallion\",\n      \"cooking_method\": \"raw\",\n      \"maillard_index\": 0,\n      \"reduction_level\": 0,\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.3,\n          \"sour\": 0,\n          \"salty\": 0.6,\n          \"bitter\": 0.4,\n          \"umami\": 0.4\n        },\n        \"texture\": [\n          \"crisp-fresh\"\n        ],\n        \"aromatics\": [\n          {\n            \"family\": \"herbal\",\n            \"group\": \"allium\",\n            \"item\": \"fresh scallion\",\n            \"intensity\": 4.0\n          }\n        ]\n      }\n    }\n  ],\n  \"overall_sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 0.9,\n      \"sour\": 0.5,\n      \"salty\": 5.3,\n      \"bitter\": 0.3,\n      \"umami\": 8.8\n    },\n    \"structural\": {\n      \"body\": 8.2,\n      \"acidity_perception\": 1.0,\n      \"tannin\": 0.0,\n      \"astringency\": 0.2,\n      \"alcohol_heat\": 0.0,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"savory\",\n        \"group\": \"bone-broth\",\n        \"item\": \"pork-bone richness\",\n        \"intensity\": 8.6\n      },\n      {\n        \"family\": \"dairy\",\n        \"group\": \"creamy\",\n        \"item\": \"milky-fat note\",\n        \"intensity\": 4.8\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"allium\",\n        \"item\": \"fresh scallion\",\n        \"intensity\": 3.8\n      },\n      {\n        \"family\": \"marine\",\n        \"group\": \"seaweed\",\n        \"item\": \"nori umami\",\n        \"intensity\": 3.5\n      }\n    ],\n    \"texture\": [\n      \"creamy coating broth\",\n      \"springy noodles\",\n      \"tender chashu\",\n      \"creamy soft yolk\",\n      \"light seaweed chew\",\n      \"fresh scallion lift\"\n    ],\n    \"finish_length\": 5.8,\n    \"complexity\": 7.2\n  },\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"user-provided dish description\",\n    \"license\": \"unspecified\"\n  }\n}",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "'ogs:core:ingredient/scallion' does not match '^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$'"
            ],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:dish:tonkotsu-ramen",
                "expected": "ogs:core:dish:tonkotsu-ramen",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "course": {
                "predicted": "main",
                "expected": "main",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3077,
            "output_tokens": 1981,
            "latency_ms": 12700.7,
            "usd": 0.000946,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.3333,
      "score_summary": {
        "mean": 0.3333,
        "stdev": 0.5774,
        "sem": 0.3333,
        "ci95_lo": -0.32,
        "ci95_hi": 0.9867,
        "n": 3
      },
      "score_if_valid": 0.3333,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 3077.0,
        "output_tokens_mean": 1747.667,
        "latency_ms_mean": 10405.833,
        "usd_sum": 0.002559
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-017",
      "category": "structured_output",
      "capability": "schema_valid_dish",
      "difficulty": "medium",
      "cuisine_region": "mexican",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.dish",
        "vocab.component_roles",
        "vocab.cooking_methods",
        "vocab.courses",
        "vocab.service_temperatures",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "534d8590759d6dc3",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS dish schema (`schema/dish.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/dish.schema.json\",\n  \"title\": \"OGS Dish\",\n  \"description\": \"A composed dish with components, cooking methods, and overall sensory profile.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"components\",\n    \"overall_sensory_profile\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:dish:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"cuisine\": {\n      \"type\": \"string\",\n      \"description\": \"Free-text cuisine name (v0.1) OR a cuisine entity reference matching ogs:<ns>:cuisine:<id> (v0.2+).\"\n    },\n    \"course\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"amuse\",\n        \"appetizer\",\n        \"soup\",\n        \"salad\",\n        \"fish\",\n        \"main\",\n        \"cheese\",\n        \"dessert\",\n        \"petit_four\"\n      ]\n    },\n    \"service_temperature\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"frozen\",\n        \"cold\",\n        \"cool\",\n        \"room\",\n        \"warm\",\n        \"hot\",\n        \"very_hot\"\n      ]\n    },\n    \"components\": {\n      \"type\": \"array\",\n      \"minItems\": 1,\n      \"items\": {\n        \"$ref\": \"#/$defs/component\"\n      }\n    },\n    \"overall_sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"component\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"role\",\n        \"ingredient_ref\"\n      ],\n      \"properties\": {\n        \"role\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"main_protein\",\n            \"main_vegetable\",\n            \"starch\",\n            \"sauce\",\n            \"fat_component\",\n            \"acid_component\",\n            \"aromatic_garnish\",\n            \"seasoning\",\n            \"textural_element\"\n          ]\n        },\n        \"ingredient_ref\": {\n          \"type\": \"string\",\n          \"pattern\": \"^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$\"\n        },\n        \"cooking_method\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"raw\",\n            \"seared\",\n            \"grilled\",\n            \"roasted\",\n            \"braised\",\n            \"poached\",\n            \"steamed\",\n            \"fried\",\n            \"confit\",\n            \"smoked\",\n            \"fermented\",\n            \"cured\",\n            \"pickled\",\n            \"dehydrated\",\n            \"sous_vide\",\n            \"blanched\",\n            \"sauteed\",\n            \"baked\",\n            \"charred\",\n            \"emulsified\"\n          ]\n        },\n        \"maillard_index\": {\n          \"type\": \"integer\",\n          \"minimum\": 0,\n          \"maximum\": 5\n        },\n        \"reduction_level\": {\n          \"type\": \"integer\",\n          \"minimum\": 0,\n          \"maximum\": 5\n        },\n        \"proportion\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"dominant\",\n            \"significant\",\n            \"accent\",\n            \"trace\"\n          ]\n        },\n        \"sensory_contribution\": {\n          \"type\": \"object\",\n          \"properties\": {\n            \"basic_tastes\": {\n              \"$ref\": \"#/$defs/basic_tastes\"\n            },\n            \"aromatics\": {\n              \"type\": \"array\",\n              \"items\": {\n                \"$ref\": \"#/$defs/aromatic_entry\"\n              }\n            },\n            \"texture\": {\n              \"type\": \"array\",\n              \"items\": {\n                \"type\": \"string\"\n              }\n            }\n          },\n          \"additionalProperties\": true\n        },\n        \"notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Service temperatures\n\nfrozen, cold, cool, room, warm, hot, very_hot\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce an OGS dish JSON for pico de gallo (tomato, onion, cilantro, jalapeño, lime). JSON only.\n\nINPUT:\n{\n  \"dish_name\": \"pico de gallo\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"pico-de-gallo\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/dish.schema.json",
        "required_fields": {
          "id": "ogs:core:dish:pico-de-gallo",
          "ogs_version": "0.1.0"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:dish:pico-de-gallo\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"pico de gallo\",\n  \"course\": \"salad\",\n  \"service_temperature\": \"cold\",\n  \"components\": [\n    {\n      \"role\": \"main_vegetable\",\n      \"ingredient_ref\": \"ogs:core:ingredient:tomato\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"dominant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 3.0,\n          \"sour\": 2.5,\n          \"salty\": 0.5,\n          \"bitter\": 0.5,\n          \"umami\": 2.0\n        },\n        \"texture\": [\n          \"juicy\",\n          \"chunky\"\n        ]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:onion\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 1.5,\n          \"sour\": 0.8,\n          \"salty\": 0.5,\n          \"bitter\": 1.3,\n          \"umami\": 1.8\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"earth\",\n            \"group\": \"onion\",\n            \"item\": \"sharp allium note\",\n            \"intensity\": 4.0,\n            \"qualifier\": \"fresh\"\n          }\n        ],\n        \"texture\": [\n          \"crisp\"\n        ]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:cilantro\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.5,\n          \"sour\": 0.2,\n          \"salty\": 0.2,\n          \"bitter\": 0.8,\n          \"umami\": 0.4\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"herbal\",\n            \"group\": \"leafy\",\n            \"item\": \"coriander-like freshness\",\n            \"intensity\": 5.5,\n            \"qualifier\": \"fresh\"\n          }\n        ],\n        \"texture\": [\n          \"leafy\"\n        ]\n      }\n    },\n    {\n      \"role\": \"seasoning\",\n      \"ingredient_ref\": \"ogs:core:ingredient:jalapeno\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.8,\n          \"sour\": 0.2,\n          \"salty\": 0.2,\n          \"bitter\": 1.0,\n          \"umami\": 0.3\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"spice\",\n            \"group\": \"chili\",\n            \"item\": \"green pepper heat\",\n            \"intensity\": 4.8,\n            \"qualifier\": \"fresh\"\n          }\n        ],\n        \"texture\": [\n          \"slightly fibrous\"\n        ]\n      }\n    },\n    {\n      \"role\": \"acid_component\",\n      \"ingredient_ref\": \"ogs:core:ingredient:lime\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.2,\n          \"sour\": 6.5,\n          \"salty\": 0.1,\n          \"bitter\": 0.4,\n          \"umami\": 0.0\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"fruit\",\n            \"group\": \"citrus\",\n            \"item\": \"lime zest / juice\",\n            \"intensity\": 6.0,\n            \"qualifier\": \"zest\"\n          }\n        ],\n        \"texture\": [\n          \"watery acidity\"\n        ]\n      }\n    }\n  ],\n  \"overall_sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 2.5,\n      \"sour\": 6.2,\n      \"salty\": 0.8,\n      \"bitter\": 1.0,\n      \"umami\": 1.8\n    },\n    \"structural\": {\n      \"body\": 2.8,\n      \"acidity_perception\": 6.8,\n      \"tannin\": 0.3,\n      \"astringency\": 0.6,\n      \"alcohol_heat\": 0.0,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"herbal\",\n        \"group\": \"leafy\",\n        \"item\": \"cilantro freshness\",\n        \"intensity\": 5.6,\n        \"qualifier\": \"fresh\"\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"lime zest\",\n        \"intensity\": 6.0,\n        \"qualifier\": \"zest\"\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"chili\",\n        \"item\": \"jalapeño green heat\",\n        \"intensity\": 4.8,\n        \"qualifier\": \"fresh\"\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"allium\",\n        \"item\": \"raw onion sharpness\",\n        \"intensity\": 4.0,\n        \"qualifier\": \"fresh\"\n      }\n    ],\n    \"texture\": [\n      \"juicy\",\n      \"crisp\",\n      \"chunky\"\n    ],\n    \"finish_length\": 3.0,\n    \"complexity\": 5.2\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:dish:pico-de-gallo",
                "expected": "ogs:core:dish:pico-de-gallo",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3062,
            "output_tokens": 1315,
            "latency_ms": 7948.7,
            "usd": 0.000679,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:dish:pico-de-gallo\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"es\",\n  \"name\": {\n    \"primary_language\": \"es\",\n    \"labels\": [\n      {\n        \"value\": \"pico de gallo\",\n        \"language\": \"es\",\n        \"role\": \"preferred\"\n      }\n    ]\n  },\n  \"description\": {\n    \"primary_language\": \"es\",\n    \"labels\": [\n      {\n        \"value\": \"Ensalada fresca de tomate, cebolla, cilantro, jalapeño y lima, servida como acompañamiento.\",\n        \"language\": \"es\",\n        \"role\": \"descriptive\"\n      }\n    ]\n  },\n  \"course\": \"salad\",\n  \"service_temperature\": \"cold\",\n  \"components\": [\n    {\n      \"role\": \"main_vegetable\",\n      \"ingredient_ref\": \"ogs:core:ingredient:tomato\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"dominant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 2.5,\n          \"sour\": 2.0,\n          \"salty\": 0.5,\n          \"bitter\": 0.4,\n          \"umami\": 1.0\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"vegetal\",\n            \"group\": \"tomato\",\n            \"item\": \"fresh tomato\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 4.8\n          }\n        ],\n        \"texture\": [\n          \"juicy\",\n          \"watery\",\n          \"chopped pieces\"\n        ]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:onion\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 1.2,\n          \"sour\": 0.6,\n          \"salty\": 0.4,\n          \"bitter\": 1.6,\n          \"umami\": 0.8\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"earth\",\n            \"group\": \"allium\",\n            \"item\": \"raw onion\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 3.6\n          }\n        ],\n        \"texture\": [\n          \"crisp\",\n          \"snappy\"\n        ]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient-cilantro\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.8,\n          \"sour\": 0.2,\n          \"salty\": 0.2,\n          \"bitter\": 0.9,\n          \"umami\": 0.1\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"herbal\",\n            \"group\": \"leafy herb\",\n            \"item\": \"cilantro\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 5.6\n          }\n        ],\n        \"texture\": [\n          \"leafy\",\n          \"fresh\"\n        ]\n      }\n    },\n    {\n      \"role\": \"seasoning\",\n      \"ingredient_ref\": \"ogs:core:ingredient-jalapeno\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.5,\n          \"sour\": 0.2,\n          \"salty\": 0.2,\n          \"bitter\": 0.8,\n          \"umami\": 0.1\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"spice\",\n            \"group\": \"chili\",\n            \"item\": \"jalapeño\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 4.2\n          }\n        ],\n        \"texture\": [\n          \"small spicy pieces\",\n          \"slightly crisp\"\n        ]\n      }\n    },\n    {\n      \"role\": \"acid_component\",\n      \"ingredient_ref\": \"ogs:core:ingredient-lime\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.3,\n          \"sour\": 6.4,\n          \"salty\": 0.1,\n          \"bitter\": 0.2,\n          \"umami\": 0.0\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"fruit\",\n            \"group\": \"citrus\",\n            \"item\": \"lime zest/juice\",\n            \"qualifier\": \"zest\",\n            \"intensity\": 6.0\n          }\n        ],\n        \"texture\": [\n          \"juicy\",\n          \"tangy liquid coating\"\n        ]\n      }\n    }\n  ],\n  \"overall_sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 2.0,\n      \"sour\": 5.8,\n      \"salty\": 1.2,\n      \"bitter\": 1.4,\n      \"umami\": 0.9\n    },\n    \"structural\": {\n      \"body\": 2.8,\n      \"acidity_perception\": 6.2,\n      \"tannin\": 0.3,\n      \"astringency\": 0.5,\n      \"alcohol_heat\": 0.0,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"vegetal\",\n        \"group\": \"tomato\",\n        \"item\": \"fresh tomato aroma\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 4.6\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"leafy herb\",\n        \"item\": \"cilantro\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 5.6\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"chili\",\n        \"item\": \"jalapeño\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 4.0\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"lime\",\n        \"qualifier\": \"zest\",\n        \"intensity\": 6.0\n      }\n    ],\n    \"texture\": [\n      \"fresh chopped vegetables\",\n      \"juicy tomato\",\n      \"crisp onion\",\n      \"leafy cilantro\"\n    ],\n    \"finish_length\": 3.2,\n    \"complexity\": 4.8\n  },\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"user-provided ingredients\"\n  }\n}",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "'ogs:core:ingredient-lime' does not match '^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$'"
            ],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:dish:pico-de-gallo",
                "expected": "ogs:core:dish:pico-de-gallo",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3062,
            "output_tokens": 1553,
            "latency_ms": 9180.4,
            "usd": 0.000774,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:dish:pico-de-gallo\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": {\n    \"en\": \"pico de gallo\"\n  },\n  \"course\": \"appetizer\",\n  \"service_temperature\": \"room\",\n  \"components\": [\n    {\n      \"role\": \"main_vegetable\",\n      \"ingredient_ref\": \"ogs:core:ingredient:tomato\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"dominant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 2.0,\n          \"sour\": 2.5,\n          \"salty\": 0.5,\n          \"bitter\": 0.3,\n          \"umami\": 1.0\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"fruit\",\n            \"group\": \"tomato\",\n            \"item\": \"tomato\",\n            \"intensity\": 3.0,\n            \"qualifier\": \"cooked\"\n          }\n        ],\n        \"texture\": [\n          \"juicy\",\n          \"soft pieces\"\n        ]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:onion\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 1.5,\n          \"sour\": 0.8,\n          \"salty\": 0.4,\n          \"bitter\": 1.2,\n          \"umami\": 0.8\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"herbal\",\n            \"group\": \"allium\",\n            \"item\": \"onion\",\n            \"intensity\": 3.0,\n            \"qualifier\": \"fresh\"\n          }\n        ],\n        \"texture\": [\n          \"crisp\",\n          \"snap\"\n        ]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient-cilantro\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.8,\n          \"sour\": 0.3,\n          \"salty\": 0.2,\n          \"bitter\": 0.6,\n          \"umami\": 0.2\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"herbal\",\n            \"group\": \"leafy\",\n            \"item\": \"cilantro\",\n            \"intensity\": 4.5,\n            \"qualifier\": \"fresh\"\n          }\n        ],\n        \"texture\": [\n          \"leafy\",\n          \"tender\"\n        ]\n      }\n    },\n    {\n      \"role\": \"seasoning\",\n      \"ingredient_ref\": \"ogs:core:ingredient:jalapeno\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.7,\n          \"sour\": 0.2,\n          \"salty\": 0.2,\n          \"bitter\": 0.8,\n          \"umami\": 0.1\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"spice\",\n            \"group\": \"chili\",\n            \"item\": \"jalapeño\",\n            \"intensity\": 4.0,\n            \"qualifier\": \"fresh\"\n          }\n        ],\n        \"texture\": [\n          \"crisp pepper pieces\"\n        ]\n      }\n    },\n    {\n      \"role\": \"acid_component\",\n      \"ingredient_ref\": \"ogs:core:ingredient:lime\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.3,\n          \"sour\": 5.0,\n          \"salty\": 0.2,\n          \"bitter\": 0.2,\n          \"umami\": 0.1\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"fruit\",\n            \"group\": \"citrus\",\n            \"item\": \"lime\",\n            \"intensity\": 4.8,\n            \"qualifier\": \"zest\"\n          }\n        ],\n        \"texture\": [\n          \"watery brightness\"\n        ]\n      }\n    }\n  ],\n  \"overall_sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 2.0,\n      \"sour\": 5.0,\n      \"salty\": 0.8,\n      \"bitter\": 1.0,\n      \"umami\": 1.2\n    },\n    \"structural\": {\n      \"body\": 3.0,\n      \"acidity_perception\": 6.0,\n      \"tannin\": 0.2,\n      \"astringency\": 0.5,\n      \"alcohol_heat\": 0.0,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"vegetal\",\n        \"group\": \"fresh cut\",\n        \"item\": \"tomato\",\n        \"intensity\": 3.5,\n        \"qualifier\": \"fresh\"\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"leafy\",\n        \"item\": \"cilantro\",\n        \"intensity\": 4.5,\n        \"qualifier\": \"fresh\"\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"chili\",\n        \"item\": \"jalapeño\",\n        \"intensity\": 4.0,\n        \"qualifier\": \"fresh\"\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"lime zest\",\n        \"intensity\": 4.8,\n        \"qualifier\": \"zest\"\n      }\n    ],\n    \"texture\": [\n      \"juicy\",\n      \"crisp pieces\",\n      \"fresh chopped texture\"\n    ],\n    \"finish_length\": 3.0,\n    \"complexity\": 4.0\n  },\n  \"notes\": {\n    \"en\": \"Fresh chopped salsa with bright lime acidity, herbal cilantro lift, and a noticeable jalapeño heat.\"\n  },\n  \"metadata\": {\n    \"source\": \"user-input\",\n    \"license\": \"unspecified\"\n  }\n}",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "'ogs:core:ingredient-cilantro' does not match '^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$'"
            ],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:dish:pico-de-gallo",
                "expected": "ogs:core:dish:pico-de-gallo",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3062,
            "output_tokens": 1440,
            "latency_ms": 8402.4,
            "usd": 0.000729,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.3333,
      "score_summary": {
        "mean": 0.3333,
        "stdev": 0.5774,
        "sem": 0.3333,
        "ci95_lo": -0.32,
        "ci95_hi": 0.9867,
        "n": 3
      },
      "score_if_valid": 0.3333,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 3062.0,
        "output_tokens_mean": 1436.0,
        "latency_ms_mean": 8510.5,
        "usd_sum": 0.002182
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-018",
      "category": "structured_output",
      "capability": "sensory_profile_json",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "json_match",
      "context_needs": [
        "scales.basic_tastes",
        "scales.structural",
        "vocab.aromatic_families",
        "vocab.qualifiers"
      ],
      "prompt_template_hash": "1470a28b8e9ef028",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Aromatic qualifiers\n\nfresh, dried, cooked, candied, fermented, zest, overripe\n\n---\n\nProduce a JSON sensory profile for 24-month aged Comté cheese. Keys: basic_tastes, structural, aromatics. All intensities on 0-10. JSON only.\n\nINPUT:\n{\n  \"item\": \"24-month aged Comté\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "value": {
          "basic_tastes": {
            "sweet": 2.0,
            "sour": 2.0,
            "salty": 5.0,
            "bitter": 1.5,
            "umami": 8.0
          },
          "structural": {
            "body": 7.0,
            "acidity_perception": 3.0,
            "tannin": 0,
            "astringency": 0,
            "alcohol_heat": 0,
            "effervescence": 0
          }
        },
        "required_keys": [
          "basic_tastes.umami",
          "basic_tastes.salty",
          "structural.body"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"basic_tastes\": {\n    \"sweet\": 1.0,\n    \"sour\": 2.5,\n    \"salty\": 6.5,\n    \"bitter\": 1.5,\n    \"umami\": 7.5\n  },\n  \"structural\": {\n    \"body\": 7.0,\n    \"acidity_perception\": 3.0,\n    \"tannin\": 0.5,\n    \"astringency\": 1.5,\n    \"alcohol_heat\": 0.5,\n    \"effervescence\": 0.0\n  },\n  \"aromatics\": {\n    \"fruit\": 1.5,\n    \"floral\": 0.5,\n    \"herbal\": 0.5,\n    \"spice\": 1.0,\n    \"earth\": 3.5,\n    \"wood\": 2.0,\n    \"dairy\": 7.0,\n    \"savory\": 6.5,\n    \"confection\": 0.5,\n    \"vegetal\": 1.0,\n    \"marine\": 0.0,\n    \"fresh\": 1.0,\n    \"dried\": 4.0,\n    \"cooked\": 2.5,\n    \"candied\": 0.5,\n    \"fermented\": 5.5,\n    \"zest\": 0.5,\n    \"overripe\": 1.0\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_key": {
              "basic_tastes.umami": {
                "predicted": 7.5,
                "expected": 8.0,
                "match": true
              },
              "basic_tastes.salty": {
                "predicted": 6.5,
                "expected": 5.0,
                "match": true
              },
              "structural.body": {
                "predicted": 7.0,
                "expected": 7.0,
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 453,
            "output_tokens": 320,
            "latency_ms": 2534.5,
            "usd": 0.000151,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"basic_tastes\": {\n    \"sweet\": 1.2,\n    \"sour\": 2.0,\n    \"salty\": 6.8,\n    \"bitter\": 2.6,\n    \"umami\": 8.4\n  },\n  \"structural\": {\n    \"body\": 7.6,\n    \"acidity_perception\": 2.2,\n    \"tannin\": 1.0,\n    \"astringency\": 1.6,\n    \"alcohol_heat\": 0.2,\n    \"effervescence\": 0.0\n  },\n  \"aromatics\": {\n    \"families\": {\n      \"fruit\": 2.2,\n      \"floral\": 1.0,\n      \"herbal\": 1.5,\n      \"spice\": 3.0,\n      \"earth\": 3.8,\n      \"wood\": 2.6,\n      \"dairy\": 7.2,\n      \"savory\": 8.2,\n      \"confection\": 0.8,\n      \"vegetal\": 1.6,\n      \"marine\": 0.1\n    },\n    \"qualifiers\": {\n      \"fresh\": 0.8,\n      \"dried\": 4.0,\n      \"cooked\": 3.2,\n      \"candied\": 0.3,\n      \"fermented\": 4.8,\n      \"zest\": 0.2,\n      \"overripe\": 1.4\n    }\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_key": {
              "basic_tastes.umami": {
                "predicted": 8.4,
                "expected": 8.0,
                "match": true
              },
              "basic_tastes.salty": {
                "predicted": 6.8,
                "expected": 5.0,
                "match": true
              },
              "structural.body": {
                "predicted": 7.6,
                "expected": 7.0,
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 453,
            "output_tokens": 336,
            "latency_ms": 2682.6,
            "usd": 0.000157,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"basic_tastes\": {\n    \"sweet\": 1.2,\n    \"sour\": 2.0,\n    \"salty\": 6.8,\n    \"bitter\": 1.6,\n    \"umami\": 7.6\n  },\n  \"structural\": {\n    \"body\": 8.2,\n    \"acidity_perception\": 3.1,\n    \"tannin\": 2.0,\n    \"astringency\": 2.3,\n    \"alcohol_heat\": 0.6,\n    \"effervescence\": 0.0\n  },\n  \"aromatics\": {\n    \"families\": {\n      \"fruit\": 3.4,\n      \"floral\": 1.1,\n      \"herbal\": 1.7,\n      \"spice\": 2.6,\n      \"earth\": 3.9,\n      \"wood\": 3.2,\n      \"dairy\": 6.5,\n      \"savory\": 7.2,\n      \"confection\": 0.6,\n      \"vegetal\": 1.3,\n      \"marine\": 0.2\n    },\n    \"qualifiers\": {\n      \"fresh\": 1.0,\n      \"dried\": 5.1,\n      \"cooked\": 2.8,\n      \"candied\": 0.4,\n      \"fermented\": 4.6,\n      \"zest\": 0.8,\n      \"overripe\": 2.4\n    }\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_key": {
              "basic_tastes.umami": {
                "predicted": 7.6,
                "expected": 8.0,
                "match": true
              },
              "basic_tastes.salty": {
                "predicted": 6.8,
                "expected": 5.0,
                "match": true
              },
              "structural.body": {
                "predicted": 8.2,
                "expected": 7.0,
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 453,
            "output_tokens": 336,
            "latency_ms": 2782.5,
            "usd": 0.000157,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 453.0,
        "output_tokens_mean": 330.667,
        "latency_ms_mean": 2666.533,
        "usd_sum": 0.000465
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-019",
      "category": "structured_output",
      "capability": "sensory_profile_json",
      "difficulty": "hard",
      "cuisine_region": "japanese",
      "scorer": "json_match",
      "context_needs": [
        "scales.basic_tastes",
        "scales.structural",
        "vocab.aromatic_families",
        "vocab.qualifiers"
      ],
      "prompt_template_hash": "aa2726470aa7cfb8",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Aromatic qualifiers\n\nfresh, dried, cooked, candied, fermented, zest, overripe\n\n---\n\nProduce a JSON sensory profile for blistered shishito peppers lightly salted. Keys: basic_tastes, structural, aromatics.\n\nINPUT:\n{\n  \"item\": \"blistered shishito peppers with flaky salt\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "value": {
          "basic_tastes": {
            "sweet": 2.0,
            "sour": 1.0,
            "salty": 5.0,
            "bitter": 3.0,
            "umami": 2.0
          },
          "structural": {
            "body": 3.0,
            "acidity_perception": 2.0,
            "tannin": 0,
            "astringency": 1.0,
            "alcohol_heat": 0,
            "effervescence": 0
          }
        },
        "required_keys": [
          "basic_tastes.salty",
          "basic_tastes.bitter",
          "structural.body"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"basic_tastes\": {\n    \"sweet\": 0.2,\n    \"sour\": 0,\n    \"salty\": 4.5,\n    \"bitter\": 1.2,\n    \"umami\": 0.8\n  },\n  \"structural\": {\n    \"body\": 2.5,\n    \"acidity_perception\": 0.4,\n    \"tannin\": 0.6,\n    \"astringency\": 0.8,\n    \"alcohol_heat\": 0,\n    \"effervescence\": 0\n  },\n  \"aromatics\": {\n    \"fruit\": {\n      \"intensity\": 0.3,\n      \"qualifiers\": [\n        \"fresh\"\n      ]\n    },\n    \"herbal\": {\n      \"intensity\": 4.6,\n      \"qualifiers\": [\n        \"fresh\",\n        \"cooked\"\n      ]\n    },\n    \"vegetal\": {\n      \"intensity\": 5.2,\n      \"qualifiers\": [\n        \"cooked\",\n        \"fresh\"\n      ]\n    },\n    \"spice\": {\n      \"intensity\": 2.2,\n      \"qualifiers\": [\n        \"cooked\"\n      ]\n    },\n    \"earth\": {\n      \"intensity\": 1.6,\n      \"qualifiers\": [\n        \"cooked\"\n      ]\n    }\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_key": {
              "basic_tastes.salty": {
                "predicted": 4.5,
                "expected": 5.0,
                "match": true
              },
              "basic_tastes.bitter": {
                "predicted": 1.2,
                "expected": 3.0,
                "match": true
              },
              "structural.body": {
                "predicted": 2.5,
                "expected": 3.0,
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 447,
            "output_tokens": 298,
            "latency_ms": 2118.7,
            "usd": 0.000142,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"basic_tastes\": {\n    \"sweet\": 0.8,\n    \"sour\": 0.2,\n    \"salty\": 6.5,\n    \"bitter\": 2.7,\n    \"umami\": 1.8\n  },\n  \"structural\": {\n    \"body\": 2.2,\n    \"acidity_perception\": 0.3,\n    \"tannin\": 0.6,\n    \"astringency\": 1.4,\n    \"alcohol_heat\": 0,\n    \"effervescence\": 0\n  },\n  \"aromatics\": {\n    \"fruit\": {\n      \"intensity\": 1.1,\n      \"qualifiers\": []\n    },\n    \"floral\": {\n      \"intensity\": 0.3,\n      \"qualifiers\": []\n    },\n    \"herbal\": {\n      \"intensity\": 5.4,\n      \"qualifiers\": [\n        \"fresh\",\n        \"cooked\"\n      ]\n    },\n    \"spice\": {\n      \"intensity\": 2.9,\n      \"qualifiers\": [\n        \"cooked\"\n      ]\n    },\n    \"earth\": {\n      \"intensity\": 2.0,\n      \"qualifiers\": [\n        \"cooked\"\n      ]\n    },\n    \"wood\": {\n      \"intensity\": 0.6,\n      \"qualifiers\": [\n        \"cooked\"\n      ]\n    },\n    \"dairy\": {\n      \"intensity\": 0,\n      \"qualifiers\": []\n    },\n    \"savory\": {\n      \"intensity\": 4.2,\n      \"qualifiers\": [\n        \"cooked\",\n        \"fermented\"\n      ]\n    },\n    \"confection\": {\n      \"intensity\": 0,\n      \"qualifiers\": []\n    },\n    \"vegetal\": {\n      \"intensity\": 6.0,\n      \"qualifiers\": [\n        \"fresh\",\n        \"cooked\"\n      ]\n    },\n    \"marine\": {\n      \"intensity\": 0.1,\n      \"qualifiers\": []\n    }\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_key": {
              "basic_tastes.salty": {
                "predicted": 6.5,
                "expected": 5.0,
                "match": true
              },
              "basic_tastes.bitter": {
                "predicted": 2.7,
                "expected": 3.0,
                "match": true
              },
              "structural.body": {
                "predicted": 2.2,
                "expected": 3.0,
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 447,
            "output_tokens": 451,
            "latency_ms": 3335.6,
            "usd": 0.000203,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"basic_tastes\": {\n    \"sweet\": 0.8,\n    \"sour\": 0.2,\n    \"salty\": 4.8,\n    \"bitter\": 2.6,\n    \"umami\": 1.4\n  },\n  \"structural\": {\n    \"body\": 2.3,\n    \"acidity_perception\": 0.3,\n    \"tannin\": 0.6,\n    \"astringency\": 1.7,\n    \"alcohol_heat\": 0.0,\n    \"effervescence\": 0.0\n  },\n  \"aromatics\": {\n    \"families\": {\n      \"vegetal\": 6.8,\n      \"herbal\": 3.2,\n      \"earth\": 1.9,\n      \"spice\": 1.6\n    },\n    \"qualifiers\": {\n      \"cooked\": 5.9,\n      \"fresh\": 2.8,\n      \"dried\": 0.4,\n      \"zest\": 0.2\n    }\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_key": {
              "basic_tastes.salty": {
                "predicted": 4.8,
                "expected": 5.0,
                "match": true
              },
              "basic_tastes.bitter": {
                "predicted": 2.6,
                "expected": 3.0,
                "match": true
              },
              "structural.body": {
                "predicted": 2.3,
                "expected": 3.0,
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 447,
            "output_tokens": 238,
            "latency_ms": 1876.1,
            "usd": 0.000118,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 447.0,
        "output_tokens_mean": 329.0,
        "latency_ms_mean": 2443.467,
        "usd_sum": 0.000463
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-020",
      "category": "structured_output",
      "capability": "schema_valid_pairing",
      "difficulty": "expert",
      "cuisine_region": "mexican",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.pairing",
        "vocab.match_types",
        "vocab.explanation_codes",
        "vocab.grade_scale",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "5b115600e8dad71f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS pairing schema (`schema/pairing.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/pairing.schema.json\",\n  \"title\": \"OGS Pairing\",\n  \"description\": \"A scored, explained pairing between a dish and a beverage.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"dish_ref\",\n    \"beverage_ref\",\n    \"match_types\",\n    \"overall_score\",\n    \"grade\",\n    \"explanation_codes\",\n    \"confidence\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:pairing:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"dish_ref\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:dish:[a-z0-9-]+$\"\n    },\n    \"beverage_ref\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:beverage:[a-z0-9-]+$\"\n    },\n    \"match_types\": {\n      \"type\": \"array\",\n      \"minItems\": 1,\n      \"items\": {\n        \"$ref\": \"#/$defs/match_type_entry\"\n      }\n    },\n    \"overall_score\": {\n      \"type\": \"integer\",\n      \"minimum\": 0,\n      \"maximum\": 100\n    },\n    \"grade\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"A+\",\n        \"A\",\n        \"B+\",\n        \"B\",\n        \"C+\",\n        \"C\",\n        \"D\",\n        \"F\"\n      ]\n    },\n    \"explanation_codes\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/explanation_code_entry\"\n      }\n    },\n    \"risks\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/risk_entry\"\n      }\n    },\n    \"confidence\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 1\n    },\n    \"evidence\": {\n      \"$ref\": \"#/$defs/evidence\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"match_type_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"type\",\n        \"score\"\n      ],\n      \"properties\": {\n        \"type\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"harmony\",\n            \"contrast\",\n            \"bridge\",\n            \"intensity_alignment\",\n            \"structural_balance\",\n            \"refresh\"\n          ]\n        },\n        \"score\": {\n          \"type\": \"integer\",\n          \"minimum\": 0,\n          \"maximum\": 100\n        },\n        \"description\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"primary\": {\n          \"type\": \"boolean\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"explanation_code_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"code\"\n      ],\n      \"properties\": {\n        \"code\": {\n          \"type\": \"string\"\n        },\n        \"weight\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 1\n        },\n        \"detail\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"risk_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"code\",\n        \"severity\"\n      ],\n      \"properties\": {\n        \"code\": {\n          \"type\": \"string\"\n        },\n        \"severity\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"low\",\n            \"medium\",\n            \"high\"\n          ]\n        },\n        \"description\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"mitigation\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"evidence\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"method\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"expert_panel\",\n            \"instrumental\",\n            \"computed\",\n            \"estimated\"\n          ]\n        },\n        \"sources\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n### OGS Explanation Codes\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n### Grade scale (derived from 0-100 overall_score)\n\n- **A+**: 95-100\n- **A**: 90-94\n- **B+**: 85-89\n- **B**: 80-84\n- **C+**: 75-79\n- **C**: 70-74\n- **D**: 60-69\n- **F**: <60\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce an OGS pairing JSON for mole-negro enchiladas with Aragonese Garnacha. JSON only.\n\nINPUT:\n{\n  \"pairing_slug\": \"mole-negro-garnacha\",\n  \"dish_ref\": \"ogs:core:dish:mole-negro-enchiladas\",\n  \"beverage_ref\": \"ogs:core:beverage:garnacha-aragon\",\n  \"intent\": \"Cross-cultural bridge pairing: ripe fruit and body handle chocolate-chile sauce.\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/pairing.schema.json",
        "required_fields": {
          "id": "ogs:core:pairing:mole-negro-garnacha",
          "ogs_version": "0.1.0",
          "dish_ref": "ogs:core:dish:mole-negro-enchiladas",
          "beverage_ref": "ogs:core:beverage:garnacha-aragon"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:pairing:mole-negro-enchiladas-garnacha-aragon\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": {\n    \"en\": \"Mole Negro Enchiladas with Aragonese Garnacha\"\n  },\n  \"dish_ref\": \"ogs:core:dish:mole-negro-enchiladas\",\n  \"beverage_ref\": \"ogs:core:beverage:garnacha-aragon\",\n  \"match_types\": [\n    {\n      \"type\": \"bridge\",\n      \"score\": 78,\n      \"description\": {\n        \"en\": \"Ripe fruit and garnacha-derived warmth provides an aromatic and flavor bridge to mole’s chocolate-chile-spice profile.\"\n      },\n      \"primary\": true\n    },\n    {\n      \"type\": \"harmony\",\n      \"score\": 75,\n      \"description\": {\n        \"en\": \"Overall flavor profiles align through complementary fruit, spice warmth, and savory depth.\"\n      },\n      \"primary\": false\n    },\n    {\n      \"type\": \"structural_balance\",\n      \"score\": 72,\n      \"description\": {\n        \"en\": \"Garnacha’s body and alcohol warmth generally match the richness of the mole and help integrate the sauce with the dish’s savory proteins.\"\n      },\n      \"primary\": false\n    },\n    {\n      \"type\": \"contrast\",\n      \"score\": 64,\n      \"description\": {\n        \"en\": \"Sweetness and spice tension is managed by the wine’s acidity, creating lift against the sauce’s density.\"\n      },\n      \"primary\": false\n    },\n    {\n      \"type\": \"intensity_alignment\",\n      \"score\": 70,\n      \"description\": {\n        \"en\": \"Neither element is expected to overwhelm the other when the garnacha has adequate fruit and the mole is balanced (not excessively bitter).\"\n      },\n      \"primary\": false\n    }\n  ],\n  \"overall_score\": 82,\n  \"grade\": \"B+\",\n  \"explanation_codes\": [\n    {\n      \"code\": \"AROMA_SHARED_SPICE\",\n      \"weight\": 0.35,\n      \"detail\": {\n        \"en\": \"Chocolate-chile/spice notes in mole echo the warm spice aromatics commonly present in garnacha.\"\n      }\n    },\n    {\n      \"code\": \"HARMONY_FLAVOR_CONVERGE\",\n      \"weight\": 0.25,\n      \"detail\": {\n        \"en\": \"Ripe fruit and savory spice depth converge with mole’s dark, complex sauce character.\"\n      }\n    },\n    {\n      \"code\": \"STRUCT_BODY_ALIGN\",\n      \"weight\": 0.25,\n      \"detail\": {\n        \"en\": \"The dish’s rich mole weight is supported by the wine’s body for a more even mouthfeel.\"\n      }\n    },\n    {\n      \"code\": \"STRUCT_ALCOHOL_WARMTH\",\n      \"weight\": 0.15,\n      \"detail\": {\n        \"en\": \"Wine warmth helps complement the sauce’s chili heat without harshness when moderate.\"\n      }\n    }\n  ],\n  \"risks\": [\n    {\n      \"code\": \"RISK_INTENSITY_MISMATCH\",\n      \"severity\": \"medium\",\n      \"description\": {\n        \"en\": \"Very aggressive, bitter, or heavily spiced mole can push past garnacha’s fruit-spice balance.\"\n      },\n      \"mitigation\": {\n        \"en\": \"Serve slightly cooler or choose a garnacha with fuller ripe fruit and softer tannins; ensure the mole’s bitterness is not dominant.\"\n      }\n    },\n    {\n      \"code\": \"RISK_SPICE_ALCOHOL\",\n      \"severity\": \"low\",\n      \"description\": {\n        \"en\": \"If the chili heat is high and the wine is high-ABV, alcohol can heighten perceived heat.\"\n      },\n      \"mitigation\": {\n        \"en\": \"Pick a garnacha with moderate alcohol and lean into accompaniments that cool (e.g., crema/pickled sides).\"\n      }\n    }\n  ],\n  \"confidence\": 0.68,\n  \"evidence\": {\n    \"method\": \"estimated\",\n    \"sources\": [\n      \"Flavor-pairing heuristics: dark chocolate-chile/spice compatibility with ripe, fruit-forward red wines\"\n    ],\n    \"notes\": {\n      \"en\": \"Assumes mole-negro is balanced (chocolate-cocoa notes plus chile warmth) and the garnacha is ripe with moderate tannins and sufficient acidity.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"For best results, aim for a garnacha with ripe fruit, gentle tannins, and no overly dominant oak; mole’s complexity should be met by matching richness and warmth in the wine.\"\n  },\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"updated\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"ogs-benchmark-generated\"\n  }\n}",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:pairing:mole-negro-enchiladas-garnacha-aragon",
                "expected": "ogs:core:pairing:mole-negro-garnacha",
                "match": false
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "dish_ref": {
                "predicted": "ogs:core:dish:mole-negro-enchiladas",
                "expected": "ogs:core:dish:mole-negro-enchiladas",
                "match": true
              },
              "beverage_ref": {
                "predicted": "ogs:core:beverage:garnacha-aragon",
                "expected": "ogs:core:beverage:garnacha-aragon",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3371,
            "output_tokens": 1059,
            "latency_ms": 6331.2,
            "usd": 0.000592,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:pairing:mole-negro-garnacha\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": {\n    \"en\": \"Mole negro enchiladas with Aragonese Garnacha\"\n  },\n  \"dish_ref\": \"ogs:core:dish:mole-negro-enchiladas\",\n  \"beverage_ref\": \"ogs:core:beverage:garnacha-aragon\",\n  \"match_types\": [\n    {\n      \"type\": \"bridge\",\n      \"score\": 86,\n      \"description\": {\n        \"en\": \"Ripe fruit notes and a wine-like aromatic body help connect to mole’s chocolate-chile complexity.\"\n      },\n      \"primary\": true\n    },\n    {\n      \"type\": \"harmony\",\n      \"score\": 80,\n      \"description\": {\n        \"en\": \"Shared warm, roasted-spice and dark-fruit direction creates overall flavor convergence.\"\n      },\n      \"primary\": false\n    },\n    {\n      \"type\": \"structural_balance\",\n      \"score\": 78,\n      \"description\": {\n        \"en\": \"Garnacha body and moderate tannin integrate with mole’s richness while the acidity keeps the bite from feeling heavy.\"\n      },\n      \"primary\": false\n    },\n    {\n      \"type\": \"intensity_alignment\",\n      \"score\": 75,\n      \"description\": {\n        \"en\": \"Wine strength generally aligns with mole’s boldness, though very spicy batches could run ahead.\"\n      },\n      \"primary\": false\n    },\n    {\n      \"type\": \"refresh\",\n      \"score\": 64,\n      \"description\": {\n        \"en\": \"Some acidity refreshes the palate between bites, but effervescence is not present.\"\n      },\n      \"primary\": false\n    }\n  ],\n  \"overall_score\": 83,\n  \"grade\": \"B+\",\n  \"explanation_codes\": [\n    {\n      \"code\": \"AROMA_SHARED_SPICE\",\n      \"weight\": 0.35,\n      \"detail\": {\n        \"en\": \"Mole’s chile-spice aromatics are echoed by the wine’s warm spice and fruit-driven aromatics.\"\n      }\n    },\n    {\n      \"code\": \"AROMA_SHARED_EARTH\",\n      \"weight\": 0.25,\n      \"detail\": {\n        \"en\": \"Earthy/roasted depth in mole aligns with earthy wine undertones that support chocolate notes.\"\n      }\n    },\n    {\n      \"code\": \"CONTRAST_SWEET_ACID\",\n      \"weight\": 0.15,\n      \"detail\": {\n        \"en\": \"Mole’s perceived sweetness from chocolate/nuts benefits from the wine’s acidity to keep flavors from feeling cloying.\"\n      }\n    },\n    {\n      \"code\": \"STRUCT_BODY_ALIGN\",\n      \"weight\": 0.15,\n      \"detail\": {\n        \"en\": \"The wine’s body matches the dish’s richness and viscosity, improving mouthfeel balance.\"\n      }\n    },\n    {\n      \"code\": \"STRUCT_ACIDITY_BALANCE\",\n      \"weight\": 0.1,\n      \"detail\": {\n        \"en\": \"Acidity helps cut through fat and chocolate-chile sauce density.\"\n      }\n    }\n  ],\n  \"confidence\": 0.68,\n  \"risks\": [\n    {\n      \"code\": \"RISK_INTENSITY_MISMATCH\",\n      \"severity\": \"medium\",\n      \"description\": {\n        \"en\": \"If the enchiladas are extremely smoky or very hot, the wine’s fruit may not fully keep up with the sauce intensity.\"\n      },\n      \"mitigation\": {\n        \"en\": \"Serve slightly cooler and pair with garnishes that temper heat (e.g., crema/cheese or milder salsa) if available.\"\n      }\n    }\n  ],\n  \"evidence\": {\n    \"method\": \"estimated\",\n    \"sources\": [\n      \"Typical flavor/structure profiles of mole negro (chocolate-chile, roasted spice, earthy notes)\"\n    ],\n    \"notes\": {\n      \"en\": \"Aragonese Garnacha is modeled as a ripe, medium-to-full-bodied red with warm spice and fruit, with enough acidity to balance mole’s richness.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Aim for a Garnacha showing ripe red/black fruit and savory spice rather than very oaky styles, so the chocolate-chile character stays in focus.\"\n  },\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"updated\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"ogs-benchmark-generated\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:pairing:mole-negro-garnacha",
                "expected": "ogs:core:pairing:mole-negro-garnacha",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "dish_ref": {
                "predicted": "ogs:core:dish:mole-negro-enchiladas",
                "expected": "ogs:core:dish:mole-negro-enchiladas",
                "match": true
              },
              "beverage_ref": {
                "predicted": "ogs:core:beverage:garnacha-aragon",
                "expected": "ogs:core:beverage:garnacha-aragon",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3371,
            "output_tokens": 994,
            "latency_ms": 6398.1,
            "usd": 0.000566,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:cross-cultural:pairing:mole-negro-garnacha\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": {\n    \"en\": \"Mole Negro Enchiladas with Aragonese Garnacha\"\n  },\n  \"description\": {\n    \"en\": \"A cross-cultural bridge pairing where the ripe fruit body of Garnacha supports mole negro’s chocolate and chile depth.\"\n  },\n  \"dish_ref\": \"ogs:core:dish:mole-negro-enchiladas\",\n  \"beverage_ref\": \"ogs:core:beverage:garnacha-aragon\",\n  \"match_types\": [\n    {\n      \"type\": \"bridge\",\n      \"score\": 88,\n      \"description\": {\n        \"en\": \"Ripe-fruit and spice notes from the Garnacha echo the mole’s chocolate-chile character, creating continuity.\"\n      },\n      \"primary\": true\n    },\n    {\n      \"type\": \"harmony\",\n      \"score\": 82,\n      \"description\": {\n        \"en\": \"Flavor profiles converge around cocoa/roast and dried-fruit tones with complementary savory elements.\"\n      },\n      \"primary\": false\n    },\n    {\n      \"type\": \"intensity_alignment\",\n      \"score\": 78,\n      \"description\": {\n        \"en\": \"The wine’s medium-to-full body is generally well-matched to the richness of the enchiladas.\"\n      },\n      \"primary\": false\n    },\n    {\n      \"type\": \"structural_balance\",\n      \"score\": 74,\n      \"description\": {\n        \"en\": \"Tannin-friendliness with fat and protein in the dish helps keep mouthfeel smooth without excessive bite.\"\n      },\n      \"primary\": false\n    }\n  ],\n  \"overall_score\": 84,\n  \"grade\": \"B\",\n  \"explanation_codes\": [\n    {\n      \"code\": \"AROMA_SHARED_SPICE\",\n      \"weight\": 0.35,\n      \"detail\": {\n        \"en\": \"Chile/spice aromatics in mole connect with spice/warm-fruit lift in Garnacha.\"\n      }\n    },\n    {\n      \"code\": \"HARMONY_FLAVOR_CONVERGE\",\n      \"weight\": 0.35,\n      \"detail\": {\n        \"en\": \"Roasted/chocolate depth of mole aligns with the wine’s ripe fruit and savory-bitter notes.\"\n      }\n    },\n    {\n      \"code\": \"STRUCT_TANNIN_FAT\",\n      \"weight\": 0.2,\n      \"detail\": {\n        \"en\": \"Tannins are softened by the dish’s fat and sauce richness, improving integration.\"\n      }\n    },\n    {\n      \"code\": \"STRUCT_BODY_ALIGN\",\n      \"weight\": 0.1,\n      \"detail\": {\n        \"en\": \"Fuller-body Garnacha matches the weight of mole negro enchiladas.\"\n      }\n    }\n  ],\n  \"risks\": [\n    {\n      \"code\": \"RISK_SPICE_ALCOHOL\",\n      \"severity\": \"medium\",\n      \"description\": {\n        \"en\": \"If the mole is very hot, alcohol can amplify perceived heat.\"\n      },\n      \"mitigation\": {\n        \"en\": \"Serve slightly cooler and ensure the dish has enough richness/crema balance to round the chile.\"\n      }\n    },\n    {\n      \"code\": \"RISK_INTENSITY_MISMATCH\",\n      \"severity\": \"low\",\n      \"description\": {\n        \"en\": \"Very assertive, smoky mole can slightly outshine the wine’s fruit.\"\n      },\n      \"mitigation\": {\n        \"en\": \"Choose a Garnacha with enough roast/savory character rather than a very light style.\"\n      }\n    }\n  ],\n  \"confidence\": 0.68,\n  \"evidence\": {\n    \"method\": \"expert_panel\",\n    \"sources\": [\n      \"OGS controlled pairing heuristics for roast/chocolate-spice with ripe, medium-full red wines\"\n    ],\n    \"notes\": {\n      \"en\": \"Assessment emphasizes mole negro’s cocoa-chile profile and Garnacha’s ripe fruit body as a cross-cultural aromatic bridge.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Best with enchiladas where mole negro is balanced (roast/cocoa-forward, chile integrated).\"\n  },\n  \"metadata\": {\n    \"created\": \"2026-04-18T00:00:00Z\",\n    \"updated\": \"2026-04-18T00:00:00Z\",\n    \"source\": \"ogs-benchmark-generation\",\n    \"license\": \"unspecified\"\n  }\n}",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": "ogs:cross-cultural:pairing:mole-negro-garnacha",
                "expected": "ogs:core:pairing:mole-negro-garnacha",
                "match": false
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "dish_ref": {
                "predicted": "ogs:core:dish:mole-negro-enchiladas",
                "expected": "ogs:core:dish:mole-negro-enchiladas",
                "match": true
              },
              "beverage_ref": {
                "predicted": "ogs:core:beverage:garnacha-aragon",
                "expected": "ogs:core:beverage:garnacha-aragon",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3371,
            "output_tokens": 972,
            "latency_ms": 6502.7,
            "usd": 0.000557,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.2887,
        "sem": 0.1667,
        "ci95_lo": 0.34,
        "ci95_hi": 0.9933,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 3371.0,
        "output_tokens_mean": 1008.333,
        "latency_ms_mean": 6410.667,
        "usd_sum": 0.001715
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-001",
      "category": "vocabulary",
      "capability": "aromatic_family_classification",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "e52270d817a9dc7f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nUsing the OGS sensory model, return the single aromatic family (from the controlled vocabulary) that best classifies the aromatic item below. Respond with just the family name.\n\nINPUT:\n{\n  \"item\": \"lemon zest\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "fruit"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "fruit",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fruit",
            "expected": "fruit",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 332,
            "output_tokens": 4,
            "latency_ms": 510.4,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "fruit",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fruit",
            "expected": "fruit",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 332,
            "output_tokens": 4,
            "latency_ms": 573.5,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "fruit",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fruit",
            "expected": "fruit",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 332,
            "output_tokens": 4,
            "latency_ms": 659.7,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 332.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 581.2,
        "usd_sum": 5.4e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-002",
      "category": "vocabulary",
      "capability": "aromatic_family_classification",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "e2096242953ff882",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nUsing the OGS sensory model, return the single aromatic family that best classifies the item below. Respond with just the family name.\n\nINPUT:\n{\n  \"item\": \"wet stone\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "earth"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "earth",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "earth",
            "expected": "earth",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 324,
            "output_tokens": 4,
            "latency_ms": 490.0,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "earth",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "earth",
            "expected": "earth",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 324,
            "output_tokens": 4,
            "latency_ms": 603.2,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "earth",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "earth",
            "expected": "earth",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 324,
            "output_tokens": 4,
            "latency_ms": 614.8,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 324.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 569.333,
        "usd_sum": 5.4e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-003",
      "category": "vocabulary",
      "capability": "aromatic_family_classification",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "fa5fa32bfe275dfb",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nUsing the OGS sensory model, return the single aromatic family that best classifies the item below. Respond with just the family name.\n\nINPUT:\n{\n  \"item\": \"cooked beef fat\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "savory"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "savory",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "savory",
            "expected": "savory",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 326,
            "output_tokens": 5,
            "latency_ms": 739.9,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "savory",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "savory",
            "expected": "savory",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 326,
            "output_tokens": 5,
            "latency_ms": 596.0,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "savory",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "savory",
            "expected": "savory",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 326,
            "output_tokens": 5,
            "latency_ms": 928.2,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 326.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 754.7,
        "usd_sum": 5.4e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-004",
      "category": "vocabulary",
      "capability": "cooking_method_vocabulary",
      "difficulty": "easy",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.cooking_methods"
      ],
      "prompt_template_hash": "e0631034b8b8ece4",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n---\n\nPick the single OGS cooking method that best describes a protein slowly cooked while fully submerged in fat at a low temperature. Respond with just the method name from the OGS vocabulary.\n\nINPUT:\n{\n  \"description\": \"duck legs slow-cooked submerged in their own rendered fat at 80 C\"\n}\n\nANSWER FORMAT: one_of:raw,seared,grilled,roasted,braised,poached,steamed,fried,confit,smoked,fermented,cured,pickled,dehydrated,sous_vide,blanched,sauteed,baked,charred,emulsified",
      "expected": {
        "value": "confit"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "confit",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "confit",
            "expected": "confit",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 406,
            "output_tokens": 5,
            "latency_ms": 583.2,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "confit",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "confit",
            "expected": "confit",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 406,
            "output_tokens": 5,
            "latency_ms": 737.6,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "confit",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "confit",
            "expected": "confit",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 406,
            "output_tokens": 5,
            "latency_ms": 602.6,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 406.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 641.133,
        "usd_sum": 6.6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-005",
      "category": "vocabulary",
      "capability": "match_type_vocabulary",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "bbbd81bb3c8b8ea4",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nSelect the single OGS match type that best describes a pairing whose strength is driven by a specific shared aromatic compound or family between dish and beverage.\n\nINPUT:\n{\n  \"description\": \"dish and wine are linked by a shared earthy, forest-floor aromatic\"\n}\n\nANSWER FORMAT: one_of:harmony,contrast,bridge,intensity_alignment,structural_balance,refresh",
      "expected": {
        "value": "bridge"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "harmony",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "harmony",
            "expected": "bridge",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 316,
            "output_tokens": 5,
            "latency_ms": 695.7,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "harmony",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "harmony",
            "expected": "bridge",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 316,
            "output_tokens": 5,
            "latency_ms": 677.1,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "harmony",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "harmony",
            "expected": "bridge",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 316,
            "output_tokens": 5,
            "latency_ms": 551.0,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 316.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 641.267,
        "usd_sum": 5.4e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-006",
      "category": "vocabulary",
      "capability": "component_role_vocabulary",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.component_roles"
      ],
      "prompt_template_hash": "6c6ae6f82f354be3",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nReturn the single OGS component role that best fits the described element. Use only roles from the OGS controlled vocabulary.\n\nINPUT:\n{\n  \"element\": \"a soy-dashi based liquid ladled over the rice bowl, providing salt, umami and moisture\"\n}\n\nANSWER FORMAT: one_of:main_protein,main_vegetable,starch,sauce,fat_component,acid_component,aromatic_garnish,seasoning,textural_element",
      "expected": {
        "value": "sauce"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "sauce",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "sauce",
            "expected": "sauce",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 350,
            "output_tokens": 5,
            "latency_ms": 529.8,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "sauce",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "sauce",
            "expected": "sauce",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 350,
            "output_tokens": 5,
            "latency_ms": 567.6,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "sauce",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "sauce",
            "expected": "sauce",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 350,
            "output_tokens": 5,
            "latency_ms": 646.3,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 350.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 581.233,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-007",
      "category": "vocabulary",
      "capability": "qualifier_vocabulary",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.qualifiers"
      ],
      "prompt_template_hash": "0adac8c06022acf5",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic qualifiers\n\nfresh, dried, cooked, candied, fermented, zest, overripe\n\n---\n\nSelect the OGS aromatic qualifier that best describes how the item is expressed.\n\nINPUT:\n{\n  \"item\": \"candied ginger\"\n}\n\nANSWER FORMAT: one_of:fresh,dried,cooked,candied,fermented,zest,overripe",
      "expected": {
        "value": "candied"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "candied",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "candied",
            "expected": "candied",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 298,
            "output_tokens": 5,
            "latency_ms": 667.6,
            "usd": 1.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "candied",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "candied",
            "expected": "candied",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 298,
            "output_tokens": 5,
            "latency_ms": 610.3,
            "usd": 1.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "candied",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "candied",
            "expected": "candied",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 298,
            "output_tokens": 5,
            "latency_ms": 521.3,
            "usd": 1.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 298.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 599.733,
        "usd_sum": 5.1e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-008",
      "category": "vocabulary",
      "capability": "grade_scale",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.grade_scale"
      ],
      "prompt_template_hash": "e58319dc1276cdb2",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Grade scale (derived from 0-100 overall_score)\n\n- **A+**: 95-100\n- **A**: 90-94\n- **B+**: 85-89\n- **B**: 80-84\n- **C+**: 75-79\n- **C**: 70-74\n- **D**: 60-69\n- **F**: <60\n\n---\n\nUsing the OGS pairing grade scale (ogs-match.md §5), return the letter grade for the given overall score. Respond with just the grade token, e.g. 'A+'.\n\nINPUT:\n{\n  \"overall_score\": 92\n}\n\nANSWER FORMAT: one_of:A+,A,B+,B,C+,C,D,F",
      "expected": {
        "value": "A"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "A",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "A",
            "expected": "A",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 385,
            "output_tokens": 4,
            "latency_ms": 610.0,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "A",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "A",
            "expected": "A",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 385,
            "output_tokens": 4,
            "latency_ms": 505.9,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "A",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "A",
            "expected": "A",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 385,
            "output_tokens": 4,
            "latency_ms": 672.9,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 385.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 596.267,
        "usd_sum": 6.3e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-009",
      "category": "vocabulary",
      "capability": "aromatic_family_classification",
      "difficulty": "easy",
      "cuisine_region": "spanish",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "d51864ab27ecfa64",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nUsing the OGS sensory model, return the single aromatic family that best characterises smoked paprika. Respond with just the family name.\n\nINPUT:\n{\n  \"item\": \"smoked paprika (pimentón de la Vera)\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "spice"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "spice",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "spice",
            "expected": "spice",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 331,
            "output_tokens": 5,
            "latency_ms": 556.9,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "spice",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "spice",
            "expected": "spice",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 331,
            "output_tokens": 5,
            "latency_ms": 517.5,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "spice",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "spice",
            "expected": "spice",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 331,
            "output_tokens": 5,
            "latency_ms": 597.1,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 331.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 557.167,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-010",
      "category": "vocabulary",
      "capability": "aromatic_family_classification",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "1748069ad0ebdde7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nUsing the OGS sensory model, return the single dominant aromatic family of a classic kombu-katsuo dashi.\n\nINPUT:\n{\n  \"item\": \"classic kombu-katsuobushi dashi\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "marine"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "marine",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "marine",
            "expected": "marine",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 328,
            "output_tokens": 4,
            "latency_ms": 603.1,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "marine",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "marine",
            "expected": "marine",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 328,
            "output_tokens": 4,
            "latency_ms": 584.9,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "marine",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "marine",
            "expected": "marine",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 328,
            "output_tokens": 4,
            "latency_ms": 636.4,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 328.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 608.133,
        "usd_sum": 5.4e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-011",
      "category": "vocabulary",
      "capability": "cooking_method_vocabulary",
      "difficulty": "easy",
      "cuisine_region": "indian",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.cooking_methods"
      ],
      "prompt_template_hash": "85de6add5c1ebfba",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n---\n\nPick the single OGS cooking method that best matches tandoori chicken cooked at 450°C in a clay oven.\n\nINPUT:\n{\n  \"description\": \"tandoori chicken, marinated and cooked in a clay tandoor at ~450°C\"\n}\n\nANSWER FORMAT: one_of:raw,seared,grilled,roasted,braised,poached,steamed,fried,confit,smoked,fermented,cured,pickled,dehydrated,sous_vide,blanched,sauteed,baked,charred,emulsified",
      "expected": {
        "value": "roasted"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "charred",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "charred",
            "expected": "roasted",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 398,
            "output_tokens": 5,
            "latency_ms": 549.1,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "baked",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "baked",
            "expected": "roasted",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 398,
            "output_tokens": 5,
            "latency_ms": 620.5,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "baked",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "baked",
            "expected": "roasted",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 398,
            "output_tokens": 5,
            "latency_ms": 649.4,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 398.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 606.333,
        "usd_sum": 6.6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-012",
      "category": "vocabulary",
      "capability": "cooking_method_vocabulary",
      "difficulty": "medium",
      "cuisine_region": "peruvian",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.cooking_methods"
      ],
      "prompt_template_hash": "44c9b26dc70c4557",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n---\n\nPick the single OGS cooking method that best describes Peruvian ceviche's treatment of white fish in lime juice.\n\nINPUT:\n{\n  \"description\": \"diced white fish denatured in freshly squeezed lime juice for 10 minutes with red onion and aji limo\"\n}\n\nANSWER FORMAT: one_of:raw,seared,grilled,roasted,braised,poached,steamed,fried,confit,smoked,fermented,cured,pickled,dehydrated,sous_vide,blanched,sauteed,baked,charred,emulsified",
      "expected": {
        "value": "cured"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "cured",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cured",
            "expected": "cured",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 401,
            "output_tokens": 5,
            "latency_ms": 621.1,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "pickled",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "pickled",
            "expected": "cured",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 401,
            "output_tokens": 5,
            "latency_ms": 554.6,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "pickled",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "pickled",
            "expected": "cured",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 401,
            "output_tokens": 5,
            "latency_ms": 753.1,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.3333,
      "score_summary": {
        "mean": 0.3333,
        "stdev": 0.5774,
        "sem": 0.3333,
        "ci95_lo": -0.32,
        "ci95_hi": 0.9867,
        "n": 3
      },
      "score_if_valid": 0.3333,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 401.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 642.933,
        "usd_sum": 6.6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-013",
      "category": "vocabulary",
      "capability": "component_role_vocabulary",
      "difficulty": "medium",
      "cuisine_region": "moroccan",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.component_roles"
      ],
      "prompt_template_hash": "dc3e62daf8bdf21a",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nReturn the single OGS component role that best fits harissa stirred into a lamb tagine at service.\n\nINPUT:\n{\n  \"description\": \"a spoonful of fiery harissa stirred into a lamb-and-apricot tagine at service\"\n}\n\nANSWER FORMAT: one_of:main_protein,main_vegetable,starch,sauce,fat_component,acid_component,aromatic_garnish,seasoning,textural_element",
      "expected": {
        "value": "seasoning"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "sauce",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "sauce",
            "expected": "seasoning",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 346,
            "output_tokens": 5,
            "latency_ms": 614.8,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "seasoning",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "seasoning",
            "expected": "seasoning",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 346,
            "output_tokens": 5,
            "latency_ms": 559.6,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "sauce",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "sauce",
            "expected": "seasoning",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 346,
            "output_tokens": 5,
            "latency_ms": 572.7,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.3333,
      "score_summary": {
        "mean": 0.3333,
        "stdev": 0.5774,
        "sem": 0.3333,
        "ci95_lo": -0.32,
        "ci95_hi": 0.9867,
        "n": 3
      },
      "score_if_valid": 0.3333,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 346.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 582.367,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-014",
      "category": "vocabulary",
      "capability": "qualifier_vocabulary",
      "difficulty": "medium",
      "cuisine_region": "chinese",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.qualifiers"
      ],
      "prompt_template_hash": "3980456a47e9b7c6",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic qualifiers\n\nfresh, dried, cooked, candied, fermented, zest, overripe\n\n---\n\nSelect the OGS aromatic qualifier that best describes the state of Sichuan peppercorn as it is commonly used — whole, toasted, and ground over finished mapo tofu.\n\nINPUT:\n{\n  \"item\": \"Sichuan peppercorn (Zanthoxylum), whole, pan-toasted just before grinding over the dish\"\n}\n\nANSWER FORMAT: one_of:fresh,dried,cooked,candied,fermented,zest,overripe",
      "expected": {
        "value": "dried"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "cooked",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "cooked",
            "expected": "dried",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 336,
            "output_tokens": 5,
            "latency_ms": 626.7,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "cooked",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "cooked",
            "expected": "dried",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 336,
            "output_tokens": 5,
            "latency_ms": 697.6,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "cooked",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "cooked",
            "expected": "dried",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 336,
            "output_tokens": 5,
            "latency_ms": 625.9,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 336.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 650.067,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-015",
      "category": "vocabulary",
      "capability": "grade_scale",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.grade_scale"
      ],
      "prompt_template_hash": "ad313a98974d64f7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Grade scale (derived from 0-100 overall_score)\n\n- **A+**: 95-100\n- **A**: 90-94\n- **B+**: 85-89\n- **B**: 80-84\n- **C+**: 75-79\n- **C**: 70-74\n- **D**: 60-69\n- **F**: <60\n\n---\n\nUsing the OGS pairing grade scale, return the letter grade for a pairing whose `overall_score` is 78.\n\nINPUT:\n{\n  \"overall_score\": 78\n}\n\nANSWER FORMAT: one_of:A+,A,B+,B,C+,C,D,F",
      "expected": {
        "value": "C+"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "C+",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "c+",
            "expected": "c+",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 371,
            "output_tokens": 5,
            "latency_ms": 534.1,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "C+",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "c+",
            "expected": "c+",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 371,
            "output_tokens": 5,
            "latency_ms": 570.7,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "C+",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "c+",
            "expected": "c+",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 371,
            "output_tokens": 5,
            "latency_ms": 563.7,
            "usd": 2.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 371.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 556.167,
        "usd_sum": 6.3e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-016",
      "category": "vocabulary",
      "capability": "match_type_vocabulary",
      "difficulty": "medium",
      "cuisine_region": "british",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "95e41ab82cba029a",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nSelect the single OGS match type that best describes a beverage whose carbonation physically scrubs the palate between bites of a rich fried dish.\n\nINPUT:\n{\n  \"description\": \"Champagne cleanses the palate between bites of fish and chips\"\n}\n\nANSWER FORMAT: one_of:harmony,contrast,bridge,intensity_alignment,structural_balance,refresh",
      "expected": {
        "value": "refresh"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "refresh",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "refresh",
            "expected": "refresh",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 313,
            "output_tokens": 4,
            "latency_ms": 593.3,
            "usd": 1.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "refresh",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "refresh",
            "expected": "refresh",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 313,
            "output_tokens": 4,
            "latency_ms": 620.9,
            "usd": 1.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "refresh",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "refresh",
            "expected": "refresh",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 313,
            "output_tokens": 4,
            "latency_ms": 504.0,
            "usd": 1.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 313.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 572.733,
        "usd_sum": 5.1e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-017",
      "category": "vocabulary",
      "capability": "aromatic_family_classification",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "8070fb687fd2f108",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nUsing the OGS sensory model, return the single dominant aromatic family of fresh cilantro leaves.\n\nINPUT:\n{\n  \"item\": \"fresh cilantro / coriander leaves\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "herbal"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "herbal",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "herbal",
            "expected": "herbal",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 318,
            "output_tokens": 5,
            "latency_ms": 508.1,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "herbal",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "herbal",
            "expected": "herbal",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 318,
            "output_tokens": 5,
            "latency_ms": 1083.3,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "herbal",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "herbal",
            "expected": "herbal",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 318,
            "output_tokens": 5,
            "latency_ms": 556.3,
            "usd": 1.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 318.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 715.9,
        "usd_sum": 5.4e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-018",
      "category": "vocabulary",
      "capability": "component_role_vocabulary",
      "difficulty": "medium",
      "cuisine_region": "levantine",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.component_roles"
      ],
      "prompt_template_hash": "9e97b59361dbdf0e",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nReturn the single OGS component role that best fits a drizzle of pomegranate molasses dressed onto a fattoush salad.\n\nINPUT:\n{\n  \"description\": \"pomegranate molasses drizzled over a fattoush salad at service\"\n}\n\nANSWER FORMAT: one_of:main_protein,main_vegetable,starch,sauce,fat_component,acid_component,aromatic_garnish,seasoning,textural_element",
      "expected": {
        "value": "acid_component"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "sauce",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "sauce",
            "expected": "acid_component",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 346,
            "output_tokens": 5,
            "latency_ms": 540.3,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "sauce",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "sauce",
            "expected": "acid_component",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 346,
            "output_tokens": 5,
            "latency_ms": 676.8,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "sauce",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "sauce",
            "expected": "acid_component",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 346,
            "output_tokens": 5,
            "latency_ms": 524.1,
            "usd": 1.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 346.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 580.4,
        "usd_sum": 5.7e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-019",
      "category": "vocabulary",
      "capability": "cooking_method_vocabulary",
      "difficulty": "medium",
      "cuisine_region": "filipino",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.cooking_methods"
      ],
      "prompt_template_hash": "fc9a763682f1225b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n---\n\nPick the single OGS cooking method that best describes bagoong alamang (Philippine fermented salt-cured shrimp paste).\n\nINPUT:\n{\n  \"description\": \"bagoong alamang: salt-cured tiny shrimp left to ferment for 3-6 months\"\n}\n\nANSWER FORMAT: one_of:raw,seared,grilled,roasted,braised,poached,steamed,fried,confit,smoked,fermented,cured,pickled,dehydrated,sous_vide,blanched,sauteed,baked,charred,emulsified",
      "expected": {
        "value": "fermented"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "fermented",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fermented",
            "expected": "fermented",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 400,
            "output_tokens": 5,
            "latency_ms": 602.2,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "fermented",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fermented",
            "expected": "fermented",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 400,
            "output_tokens": 5,
            "latency_ms": 500.3,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "fermented",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fermented",
            "expected": "fermented",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 400,
            "output_tokens": 5,
            "latency_ms": 635.1,
            "usd": 2.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 400.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 579.2,
        "usd_sum": 6.6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-020",
      "category": "vocabulary",
      "capability": "grade_scale",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.grade_scale"
      ],
      "prompt_template_hash": "56ad4f9c58bc1eb7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Grade scale (derived from 0-100 overall_score)\n\n- **A+**: 95-100\n- **A**: 90-94\n- **B+**: 85-89\n- **B**: 80-84\n- **C+**: 75-79\n- **C**: 70-74\n- **D**: 60-69\n- **F**: <60\n\n---\n\nUsing the OGS pairing grade scale, return the letter grade for a pairing whose `overall_score` is 64.\n\nINPUT:\n{\n  \"overall_score\": 64\n}\n\nANSWER FORMAT: one_of:A+,A,B+,B,C+,C,D,F",
      "expected": {
        "value": "D"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "D",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "d",
            "expected": "d",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 371,
            "output_tokens": 4,
            "latency_ms": 1002.6,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "D",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "d",
            "expected": "d",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 371,
            "output_tokens": 4,
            "latency_ms": 592.0,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "D",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "d",
            "expected": "d",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 371,
            "output_tokens": 4,
            "latency_ms": 557.0,
            "usd": 2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 371.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 717.2,
        "usd_sum": 6e-05
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-001",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "a75ad0ee2a385794",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nFrom the OGS explanation-code vocabulary, return the set of codes that best explain WHY the pairing below is excellent. Respond as a JSON array of code IDs. Pick 3 to 6 most relevant positive codes.\n\nINPUT:\n{\n  \"dish\": \"A5 wagyu with black truffle jus (very rich, high umami, earthy aromatics, full body)\",\n  \"beverage\": \"Barolo 2018 (Nebbiolo, high tannin, high acidity, full body, tar/forest-floor)\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_TANNIN_FAT",
          "STRUCT_INTENSITY_MATCH",
          "AROMA_SHARED_EARTH",
          "STRUCT_BODY_ALIGN",
          "HARMONY_FLAVOR_CONVERGE"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_INTENSITY_MATCH\",\"STRUCT_ACIDITY_BALANCE\",\"AROMA_SHARED_EARTH\",\"STRUCT_UMAMI_ENHANCE\",\"STRUCT_BODY_ALIGN\"]",
          "compliant": true,
          "score": 0.5455,
          "detail": {
            "predicted": [
              "AROMA_SHARED_EARTH",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_TANNIN_PROTEIN",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "expected": [
              "AROMA_SHARED_EARTH",
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_TANNIN_FAT"
            ],
            "precision": 0.5,
            "recall": 0.6
          },
          "metrics": {
            "input_tokens": 1158,
            "output_tokens": 44,
            "latency_ms": 817.7,
            "usd": 7.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"STRUCT_TANNIN_FAT\",\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_UMAMI_ENHANCE\",\"STRUCT_ACIDITY_BALANCE\",\"aromatic_bridge.AROMA_SHARED_EARTH\",\"positive_structural.STRUCT_ALCOHOL_WARMTH\"]",
          "compliant": true,
          "score": 0.1818,
          "detail": {
            "predicted": [
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN",
              "STRUCT_UMAMI_ENHANCE",
              "aromatic_bridge.AROMA_SHARED_EARTH",
              "positive_structural.STRUCT_ALCOHOL_WARMTH"
            ],
            "expected": [
              "AROMA_SHARED_EARTH",
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_TANNIN_FAT"
            ],
            "precision": 0.1667,
            "recall": 0.2
          },
          "metrics": {
            "input_tokens": 1158,
            "output_tokens": 56,
            "latency_ms": 752.0,
            "usd": 8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_TANNIN_FAT\",\"STRUCT_ACIDITY_BALANCE\",\"AROMA_SHARED_EARTH\",\"STRUCT_UMAMI_ENHANCE\"]",
          "compliant": true,
          "score": 0.4,
          "detail": {
            "predicted": [
              "AROMA_SHARED_EARTH",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "expected": [
              "AROMA_SHARED_EARTH",
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_TANNIN_FAT"
            ],
            "precision": 0.4,
            "recall": 0.4
          },
          "metrics": {
            "input_tokens": 1158,
            "output_tokens": 41,
            "latency_ms": 710.8,
            "usd": 7.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.3758,
      "score_summary": {
        "mean": 0.3758,
        "stdev": 0.1831,
        "sem": 0.1057,
        "ci95_lo": 0.1686,
        "ci95_hi": 0.5829,
        "n": 3
      },
      "score_if_valid": 0.3758,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1158.0,
        "output_tokens_mean": 47.0,
        "latency_ms_mean": 760.167,
        "usd_sum": 0.00023
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-002",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "bdc5d8d74c85e9ab",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS explanation codes that best explain the success of the pairing below. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"fresh oysters on the half shell with lemon\",\n  \"beverage\": \"Chablis Premier Cru (Chardonnay, high acid, mineral, lean body)\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_ACIDITY_BALANCE",
          "REFRESH_ACID",
          "REFRESH_MINERAL",
          "CONTRAST_RICH_CRISP",
          "HARMONY_REGIONAL"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_ACIDITY_BALANCE\",\"REFRESH_ACID\",\"REFRESH_MINERAL\",\"HARMONY_FLAVOR_CONVERGE\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "HARMONY_FLAVOR_CONVERGE",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "expected": [
              "CONTRAST_RICH_CRISP",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "precision": 0.75,
            "recall": 0.6
          },
          "metrics": {
            "input_tokens": 1123,
            "output_tokens": 30,
            "latency_ms": 814.3,
            "usd": 6.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"REFRESH_ACID\",\"REFRESH_MINERAL\",\"STRUCT_TANNIN_FAT\",\"STRUCT_BODY_ALIGN\"]",
          "compliant": true,
          "score": 0.4444,
          "detail": {
            "predicted": [
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "CONTRAST_RICH_CRISP",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "precision": 0.5,
            "recall": 0.4
          },
          "metrics": {
            "input_tokens": 1123,
            "output_tokens": 25,
            "latency_ms": 685.9,
            "usd": 6.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"REFRESH_ACID\",\"REFRESH_MINERAL\",\"STRUCT_INTENSITY_MATCH\",\"CONTRAST_TEMP_TEXTURE\"]",
          "compliant": true,
          "score": 0.4444,
          "detail": {
            "predicted": [
              "CONTRAST_TEMP_TEXTURE",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_INTENSITY_MATCH"
            ],
            "expected": [
              "CONTRAST_RICH_CRISP",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "precision": 0.5,
            "recall": 0.4
          },
          "metrics": {
            "input_tokens": 1123,
            "output_tokens": 26,
            "latency_ms": 741.7,
            "usd": 6.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.5185,
      "score_summary": {
        "mean": 0.5185,
        "stdev": 0.1283,
        "sem": 0.0741,
        "ci95_lo": 0.3733,
        "ci95_hi": 0.6637,
        "n": 3
      },
      "score_if_valid": 0.5185,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1123.0,
        "output_tokens_mean": 27.0,
        "latency_ms_mean": 747.3,
        "usd_sum": 0.000201
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-003",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "3440bcf3978a1c58",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (2 to 4 codes).\n\nINPUT:\n{\n  \"dish\": \"crispy buttermilk fried chicken\",\n  \"beverage\": \"non-vintage Brut Champagne\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "REFRESH_EFFERVESCE",
          "REFRESH_ACID",
          "CONTRAST_RICH_CRISP"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"refresh_acid\",\"contrast_rich_crisp\",\"positive_structural_tannin_fat\"]",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": [
              "contrast_rich_crisp",
              "positive_structural_tannin_fat",
              "refresh_acid"
            ],
            "expected": [
              "CONTRAST_RICH_CRISP",
              "REFRESH_ACID",
              "REFRESH_EFFERVESCE"
            ],
            "precision": 0.0,
            "recall": 0.0
          },
          "metrics": {
            "input_tokens": 1110,
            "output_tokens": 22,
            "latency_ms": 609.3,
            "usd": 6.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"REFRESH_EFFERVESCE\",\"STRUCT_TANNIN_FAT\",\"CONTRAST_RICH_CRISP\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "CONTRAST_RICH_CRISP",
              "REFRESH_EFFERVESCE",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "CONTRAST_RICH_CRISP",
              "REFRESH_ACID",
              "REFRESH_EFFERVESCE"
            ],
            "precision": 0.6667,
            "recall": 0.6667
          },
          "metrics": {
            "input_tokens": 1110,
            "output_tokens": 27,
            "latency_ms": 884.0,
            "usd": 6.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"REFRESH_EFFERVESCE\",\"STRUCT_ACIDITY_BALANCE\",\"STRUCT_TANNIN_FAT\"]",
          "compliant": true,
          "score": 0.3333,
          "detail": {
            "predicted": [
              "REFRESH_EFFERVESCE",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "CONTRAST_RICH_CRISP",
              "REFRESH_ACID",
              "REFRESH_EFFERVESCE"
            ],
            "precision": 0.3333,
            "recall": 0.3333
          },
          "metrics": {
            "input_tokens": 1110,
            "output_tokens": 26,
            "latency_ms": 726.5,
            "usd": 6.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.3333,
      "score_summary": {
        "mean": 0.3333,
        "stdev": 0.3334,
        "sem": 0.1925,
        "ci95_lo": -0.0439,
        "ci95_hi": 0.7106,
        "n": 3
      },
      "score_if_valid": 0.3333,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1110.0,
        "output_tokens_mean": 25.0,
        "latency_ms_mean": 739.933,
        "usd_sum": 0.000196
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-004",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "hard",
      "cuisine_region": "french",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "b3249dafda966f70",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS explanation codes that best explain the success of the pairing below. Respond as a JSON array (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"seared foie gras on brioche (very rich, fatty, slightly sweet-savoury)\",\n  \"beverage\": \"Sauternes (botrytised sweet white, high residual sugar, high acidity)\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_SWEET_OFFSET",
          "STRUCT_ACIDITY_BALANCE",
          "CONTRAST_RICH_CRISP",
          "HARMONY_REGIONAL"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_TANNIN_FAT\",\"STRUCT_SWEET_OFFSET\",\"CONTRAST_SWEET_ACID\"]",
          "compliant": true,
          "score": 0.2857,
          "detail": {
            "predicted": [
              "CONTRAST_SWEET_ACID",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "CONTRAST_RICH_CRISP",
              "HARMONY_REGIONAL",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.3333,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1130,
            "output_tokens": 26,
            "latency_ms": 826.8,
            "usd": 6.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"STRUCT_SWEET_OFFSET\",\"STRUCT_ACIDITY_BALANCE\",\"STRUCT_TANNIN_FAT\",\"CONTRAST_SWEET_SALTY\"]",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": [
              "CONTRAST_SWEET_SALTY",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "CONTRAST_RICH_CRISP",
              "HARMONY_REGIONAL",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.5,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1130,
            "output_tokens": 34,
            "latency_ms": 756.7,
            "usd": 7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_TANNIN_FAT\",\"STRUCT_SWEET_OFFSET\",\"STRUCT_ACIDITY_BALANCE\",\"harmony.HARMONY_FLAVOR_CONVERGE\"]",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": [
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_FAT",
              "harmony.HARMONY_FLAVOR_CONVERGE"
            ],
            "expected": [
              "CONTRAST_RICH_CRISP",
              "HARMONY_REGIONAL",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.5,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1130,
            "output_tokens": 35,
            "latency_ms": 762.0,
            "usd": 7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.4286,
      "score_summary": {
        "mean": 0.4286,
        "stdev": 0.1237,
        "sem": 0.0714,
        "ci95_lo": 0.2886,
        "ci95_hi": 0.5686,
        "n": 3
      },
      "score_if_valid": 0.4286,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1130.0,
        "output_tokens_mean": 31.667,
        "latency_ms_mean": 781.833,
        "usd_sum": 0.000207
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-005",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "0cb2f1946943a0e2",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS explanation codes that best explain why the pairing works. Respond as a JSON array (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"charcoal-grilled lamb rack with rosemary, black pepper crust, charred edges\",\n  \"beverage\": \"Northern Rhone Syrah (Cote-Rotie) — black pepper, violet, smoked meat, moderate-high tannin\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "AROMA_SHARED_SPICE",
          "AROMA_SHARED_WOOD",
          "AROMA_SHARED_HERBAL",
          "STRUCT_TANNIN_FAT",
          "STRUCT_TANNIN_PROTEIN"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_TANNIN_FAT\",\"aromatic_bridge\",\"positive_structural\"]",
          "compliant": true,
          "score": 0.4444,
          "detail": {
            "predicted": [
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN",
              "aromatic_bridge",
              "positive_structural"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "AROMA_SHARED_SPICE",
              "AROMA_SHARED_WOOD",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.5,
            "recall": 0.4
          },
          "metrics": {
            "input_tokens": 1135,
            "output_tokens": 27,
            "latency_ms": 719.8,
            "usd": 6.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"positive_structural:STRUCT_TANNIN_PROTEIN\",\"positive_structural:STRUCT_TANNIN_FAT\",\"aromatic_bridge:AROMA_SHARED_SPICE\",\"aromatic_bridge:AROMA_SHARED_EARTH\",\"refresh:REFRESH_ACID\"]",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": [
              "aromatic_bridge:AROMA_SHARED_EARTH",
              "aromatic_bridge:AROMA_SHARED_SPICE",
              "positive_structural:STRUCT_TANNIN_FAT",
              "positive_structural:STRUCT_TANNIN_PROTEIN",
              "refresh:REFRESH_ACID"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "AROMA_SHARED_SPICE",
              "AROMA_SHARED_WOOD",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.0,
            "recall": 0.0
          },
          "metrics": {
            "input_tokens": 1135,
            "output_tokens": 55,
            "latency_ms": 854.8,
            "usd": 7.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_TANNIN_FAT\",\"AROMATIC_BRIDGE_AROMA_SHARED_SPICE\",\"contrast_RICH_CRISP\"]",
          "compliant": true,
          "score": 0.4444,
          "detail": {
            "predicted": [
              "AROMATIC_BRIDGE_AROMA_SHARED_SPICE",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN",
              "contrast_RICH_CRISP"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "AROMA_SHARED_SPICE",
              "AROMA_SHARED_WOOD",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.5,
            "recall": 0.4
          },
          "metrics": {
            "input_tokens": 1135,
            "output_tokens": 36,
            "latency_ms": 688.2,
            "usd": 7.1e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.2963,
      "score_summary": {
        "mean": 0.2963,
        "stdev": 0.2566,
        "sem": 0.1481,
        "ci95_lo": 0.0059,
        "ci95_hi": 0.5866,
        "n": 3
      },
      "score_if_valid": 0.2963,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1135.0,
        "output_tokens_mean": 39.333,
        "latency_ms_mean": 754.267,
        "usd_sum": 0.000218
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-006",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "9a46b0d141442ce3",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"Pizza Margherita (tomato, mozzarella, basil)\",\n  \"beverage\": \"Chianti Classico (Sangiovese, high acid, medium tannin)\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_ACIDITY_BALANCE",
          "HARMONY_REGIONAL",
          "STRUCT_INTENSITY_MATCH",
          "AROMA_SHARED_HERBAL"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_ACIDITY_BALANCE\",\"STRUCT_TANNIN_FAT\",\"REFRESH_ACID\",\"HARMONY_FLAVOR_CONVERGE\"]",
          "compliant": true,
          "score": 0.25,
          "detail": {
            "predicted": [
              "HARMONY_FLAVOR_CONVERGE",
              "REFRESH_ACID",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_INTENSITY_MATCH"
            ],
            "precision": 0.25,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1126,
            "output_tokens": 32,
            "latency_ms": 793.1,
            "usd": 6.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"STRUCT_ACIDITY_BALANCE\",\"AROMA_SHARED_HERBAL\",\"STRUCT_TANNIN_PROTEIN\"]",
          "compliant": true,
          "score": 0.5714,
          "detail": {
            "predicted": [
              "AROMA_SHARED_HERBAL",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_INTENSITY_MATCH"
            ],
            "precision": 0.6667,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1126,
            "output_tokens": 26,
            "latency_ms": 646.0,
            "usd": 6.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_ACIDITY_BALANCE\",\"STRUCT_TANNIN_PROTEIN\",\"HARMONY_FLAVOR_CONVERGE\"]",
          "compliant": true,
          "score": 0.2857,
          "detail": {
            "predicted": [
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_INTENSITY_MATCH"
            ],
            "precision": 0.3333,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1126,
            "output_tokens": 28,
            "latency_ms": 781.7,
            "usd": 6.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.369,
      "score_summary": {
        "mean": 0.369,
        "stdev": 0.1762,
        "sem": 0.1017,
        "ci95_lo": 0.1697,
        "ci95_hi": 0.5684,
        "n": 3
      },
      "score_if_valid": 0.369,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1126.0,
        "output_tokens_mean": 28.667,
        "latency_ms_mean": 740.267,
        "usd_sum": 0.000204
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-007",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "hard",
      "cuisine_region": "chinese",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "497adc64f3303eb4",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"mapo tofu with Sichuan peppercorn\",\n  \"beverage\": \"off-dry German Spätlese Riesling\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_SWEET_OFFSET",
          "REFRESH_ACID",
          "STRUCT_INTENSITY_MATCH",
          "CONTRAST_SWEET_SALTY"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_SWEET_OFFSET\",\"STRUCT_ACIDITY_BALANCE\",\"CONTRAST_SWEET_ACID\",\"REFRESH_ACID\"]",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": [
              "CONTRAST_SWEET_ACID",
              "REFRESH_ACID",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "expected": [
              "CONTRAST_SWEET_SALTY",
              "REFRESH_ACID",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.5,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1117,
            "output_tokens": 31,
            "latency_ms": 699.4,
            "usd": 6.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\n  \"STRUCT_TANNIN_PROTEIN\",\n  \"STRUCT_SWEET_OFFSET\",\n  \"REFRESH_ACID\",\n  \"CONTRAST_SWEET_ACID\",\n  \"AROMA_SHARED_SPICE\"\n]",
          "compliant": true,
          "score": 0.4444,
          "detail": {
            "predicted": [
              "AROMA_SHARED_SPICE",
              "CONTRAST_SWEET_ACID",
              "REFRESH_ACID",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "CONTRAST_SWEET_SALTY",
              "REFRESH_ACID",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.4,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1117,
            "output_tokens": 49,
            "latency_ms": 851.0,
            "usd": 7.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_SWEET_OFFSET\",\"STRUCT_ACIDITY_BALANCE\",\"REFRESH_ACID\",\"CONTRAST_SWEET_ACID\"]",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": [
              "CONTRAST_SWEET_ACID",
              "REFRESH_ACID",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "expected": [
              "CONTRAST_SWEET_SALTY",
              "REFRESH_ACID",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.5,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1117,
            "output_tokens": 31,
            "latency_ms": 714.4,
            "usd": 6.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.4815,
      "score_summary": {
        "mean": 0.4815,
        "stdev": 0.0321,
        "sem": 0.0185,
        "ci95_lo": 0.4451,
        "ci95_hi": 0.5178,
        "n": 3
      },
      "score_if_valid": 0.4815,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1117.0,
        "output_tokens_mean": 37.0,
        "latency_ms_mean": 754.933,
        "usd_sum": 0.000211
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-008",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "2120f96a0444a50e",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"herb-crusted rack of lamb, medium-rare\",\n  \"beverage\": \"Northern Rhône Syrah\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_TANNIN_PROTEIN",
          "AROMA_SHARED_HERBAL",
          "STRUCT_BODY_ALIGN",
          "HARMONY_FLAVOR_CONVERGE"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_ACIDITY_BALANCE\",\"AROMA_SHARED_HERBAL\",\"AROMA_SHARED_EARTH\"]",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": [
              "AROMA_SHARED_EARTH",
              "AROMA_SHARED_HERBAL",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.5,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1114,
            "output_tokens": 33,
            "latency_ms": 731.0,
            "usd": 6.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_ACIDITY_BALANCE\",\"HARMONY_FLAVOR_CONVERGE\"]",
          "compliant": true,
          "score": 0.5714,
          "detail": {
            "predicted": [
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.6667,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1114,
            "output_tokens": 28,
            "latency_ms": 640.6,
            "usd": 6.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"HARMONY_FLAVOR_CONVERGE\",\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_ACIDITY_BALANCE\"]",
          "compliant": true,
          "score": 0.5714,
          "detail": {
            "predicted": [
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.6667,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1114,
            "output_tokens": 28,
            "latency_ms": 825.2,
            "usd": 6.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.5476,
      "score_summary": {
        "mean": 0.5476,
        "stdev": 0.0412,
        "sem": 0.0238,
        "ci95_lo": 0.501,
        "ci95_hi": 0.5942,
        "n": 3
      },
      "score_if_valid": 0.5476,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1114.0,
        "output_tokens_mean": 29.667,
        "latency_ms_mean": 732.267,
        "usd_sum": 0.000203
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-009",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "hard",
      "cuisine_region": "thai",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "c077c498ceed5f73",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"Thai green curry with chicken and Thai basil\",\n  \"beverage\": \"off-dry New Zealand Sauvignon Blanc\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_SWEET_OFFSET",
          "AROMA_SHARED_HERBAL",
          "REFRESH_ACID",
          "STRUCT_INTENSITY_MATCH"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"aromatic_bridge\",\"CONTRAST_SWEET_ACID\",\"STRUCT_ACIDITY_BALANCE\",\"REFRESH_ACID\",\"STRUCT_SWEET_OFFSET\"]",
          "compliant": true,
          "score": 0.4444,
          "detail": {
            "predicted": [
              "CONTRAST_SWEET_ACID",
              "REFRESH_ACID",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET",
              "aromatic_bridge"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "REFRESH_ACID",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.4,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1114,
            "output_tokens": 35,
            "latency_ms": 756.5,
            "usd": 7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"CONTRAST_SWEET_ACID\",\"STRUCT_ACIDITY_BALANCE\",\"AROMA_SHARED_HERBAL\",\"STRUCT_SWEET_OFFSET\"]",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": [
              "AROMA_SHARED_HERBAL",
              "CONTRAST_SWEET_ACID",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "REFRESH_ACID",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.5,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1114,
            "output_tokens": 33,
            "latency_ms": 770.6,
            "usd": 6.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"aromatic_bridge\",\"STRUCT_ACIDITY_BALANCE\",\"STRUCT_SWEET_OFFSET\",\"CONTRAST_SWEET_ACID\"]",
          "compliant": true,
          "score": 0.25,
          "detail": {
            "predicted": [
              "CONTRAST_SWEET_ACID",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET",
              "aromatic_bridge"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "REFRESH_ACID",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.25,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1114,
            "output_tokens": 30,
            "latency_ms": 868.7,
            "usd": 6.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.3981,
      "score_summary": {
        "mean": 0.3981,
        "stdev": 0.1313,
        "sem": 0.0758,
        "ci95_lo": 0.2496,
        "ci95_hi": 0.5467,
        "n": 3
      },
      "score_if_valid": 0.3981,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1114.0,
        "output_tokens_mean": 32.667,
        "latency_ms_mean": 798.6,
        "usd_sum": 0.000207
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-010",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "easy",
      "cuisine_region": "french",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "956c6ce654c2e297",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"moules marinières\",\n  \"beverage\": \"Muscadet sur lie\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_ACIDITY_BALANCE",
          "REFRESH_ACID",
          "AROMA_SHARED_HERBAL",
          "HARMONY_REGIONAL",
          "REFRESH_MINERAL"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"REFRESH_ACID\",\"REFRESH_MINERAL\",\"STRUCT_ACIDITY_BALANCE\"]",
          "compliant": true,
          "score": 0.75,
          "detail": {
            "predicted": [
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "precision": 1.0,
            "recall": 0.6
          },
          "metrics": {
            "input_tokens": 1110,
            "output_tokens": 21,
            "latency_ms": 626.4,
            "usd": 6.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"REFRESH_ACID\",\"REFRESH_MINERAL\",\"STRUCT_ACIDITY_BALANCE\"]",
          "compliant": true,
          "score": 0.75,
          "detail": {
            "predicted": [
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "precision": 1.0,
            "recall": 0.6
          },
          "metrics": {
            "input_tokens": 1110,
            "output_tokens": 21,
            "latency_ms": 663.3,
            "usd": 6.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"REFRESH_EFFERVESCE\",\"REFRESH_ACID\",\"STRUCT_BODY_ALIGN\",\"STRUCT_TANNIN_PROTEIN\"]",
          "compliant": true,
          "score": 0.2222,
          "detail": {
            "predicted": [
              "REFRESH_ACID",
              "REFRESH_EFFERVESCE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "precision": 0.25,
            "recall": 0.2
          },
          "metrics": {
            "input_tokens": 1110,
            "output_tokens": 29,
            "latency_ms": 732.4,
            "usd": 6.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.5741,
      "score_summary": {
        "mean": 0.5741,
        "stdev": 0.3047,
        "sem": 0.1759,
        "ci95_lo": 0.2292,
        "ci95_hi": 0.9189,
        "n": 3
      },
      "score_if_valid": 0.5741,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1110.0,
        "output_tokens_mean": 23.667,
        "latency_ms_mean": 674.033,
        "usd_sum": 0.000195
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-011",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "hard",
      "cuisine_region": "mexican",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "1dfeb782059a4814",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"tacos al pastor with pineapple and chili\",\n  \"beverage\": \"dry sparkling rosé (Cava rosado)\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_SWEET_OFFSET",
          "REFRESH_EFFERVESCE",
          "AROMA_SHARED_FRUIT",
          "CONTRAST_RICH_CRISP"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"REFRESH_EFFERVESCE\",\"REFRESH_ACID\",\"AROMA_SHARED_FRUIT\",\"CONTRAST_TEMP_TEXTURE\",\"STRUCT_TANNIN_FAT\"]",
          "compliant": true,
          "score": 0.4444,
          "detail": {
            "predicted": [
              "AROMA_SHARED_FRUIT",
              "CONTRAST_TEMP_TEXTURE",
              "REFRESH_ACID",
              "REFRESH_EFFERVESCE",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "AROMA_SHARED_FRUIT",
              "CONTRAST_RICH_CRISP",
              "REFRESH_EFFERVESCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.4,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1116,
            "output_tokens": 36,
            "latency_ms": 719.1,
            "usd": 7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"REFRESH_EFFERVESCE\",\"REFRESH_ACID\",\"AROMA_SHARED_FRUIT\",\"CONTRAST_SWEET_ACID\"]",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": [
              "AROMA_SHARED_FRUIT",
              "CONTRAST_SWEET_ACID",
              "REFRESH_ACID",
              "REFRESH_EFFERVESCE"
            ],
            "expected": [
              "AROMA_SHARED_FRUIT",
              "CONTRAST_RICH_CRISP",
              "REFRESH_EFFERVESCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.5,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1116,
            "output_tokens": 32,
            "latency_ms": 779.9,
            "usd": 6.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"REFRESH_EFFERVESCE\",\"REFRESH_ACID\",\"CONTRAST_SWEET_SALTY\",\"AROMA_SHARED_FRUIT\"]",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": [
              "AROMA_SHARED_FRUIT",
              "CONTRAST_SWEET_SALTY",
              "REFRESH_ACID",
              "REFRESH_EFFERVESCE"
            ],
            "expected": [
              "AROMA_SHARED_FRUIT",
              "CONTRAST_RICH_CRISP",
              "REFRESH_EFFERVESCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.5,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1116,
            "output_tokens": 33,
            "latency_ms": 626.8,
            "usd": 6.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.4815,
      "score_summary": {
        "mean": 0.4815,
        "stdev": 0.0321,
        "sem": 0.0185,
        "ci95_lo": 0.4451,
        "ci95_hi": 0.5178,
        "n": 3
      },
      "score_if_valid": 0.4815,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1116.0,
        "output_tokens_mean": 33.667,
        "latency_ms_mean": 708.6,
        "usd_sum": 0.000208
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-012",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "28df36c8ae155697",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"duck confit, crispy skin\",\n  \"beverage\": \"mature Burgundy Pinot Noir (10 years)\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_TANNIN_FAT",
          "AROMA_SHARED_EARTH",
          "HARMONY_REGIONAL",
          "HARMONY_COMPLEXITY_MATCH"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_TANNIN_FAT\",\"STRUCT_INTENSITY_MATCH\",\"STRUCT_BODY_ALIGN\",\"STRUCT_ACIDITY_BALANCE\"]",
          "compliant": true,
          "score": 0.25,
          "detail": {
            "predicted": [
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "AROMA_SHARED_EARTH",
              "HARMONY_COMPLEXITY_MATCH",
              "HARMONY_REGIONAL",
              "STRUCT_TANNIN_FAT"
            ],
            "precision": 0.25,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1113,
            "output_tokens": 28,
            "latency_ms": 631.5,
            "usd": 6.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"STRUCT_TANNIN_FAT\",\"STRUCT_ACIDITY_BALANCE\",\"CONTRAST_RICH_CRISP\"]",
          "compliant": true,
          "score": 0.2857,
          "detail": {
            "predicted": [
              "CONTRAST_RICH_CRISP",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "AROMA_SHARED_EARTH",
              "HARMONY_COMPLEXITY_MATCH",
              "HARMONY_REGIONAL",
              "STRUCT_TANNIN_FAT"
            ],
            "precision": 0.3333,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1113,
            "output_tokens": 26,
            "latency_ms": 724.1,
            "usd": 6.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_TANNIN_FAT\",\"STRUCT_ACIDITY_BALANCE\",\"STRUCT_BODY_ALIGN\",\"CONTRAST_RICH_CRISP\"]",
          "compliant": true,
          "score": 0.25,
          "detail": {
            "predicted": [
              "CONTRAST_RICH_CRISP",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "AROMA_SHARED_EARTH",
              "HARMONY_COMPLEXITY_MATCH",
              "HARMONY_REGIONAL",
              "STRUCT_TANNIN_FAT"
            ],
            "precision": 0.25,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1113,
            "output_tokens": 30,
            "latency_ms": 753.6,
            "usd": 6.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.2619,
      "score_summary": {
        "mean": 0.2619,
        "stdev": 0.0206,
        "sem": 0.0119,
        "ci95_lo": 0.2386,
        "ci95_hi": 0.2852,
        "n": 3
      },
      "score_if_valid": 0.2619,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1113.0,
        "output_tokens_mean": 28.0,
        "latency_ms_mean": 703.067,
        "usd_sum": 0.000201
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-013",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "spanish",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "18a20e7e30844c41",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"jamón ibérico de bellota, sliced thin\",\n  \"beverage\": \"chilled Fino sherry (Manzanilla)\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_UMAMI_ENHANCE",
          "HARMONY_REGIONAL",
          "AROMA_SHARED_SAVORY",
          "REFRESH_MINERAL"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_SWEET_OFFSET\",\"REFRESH_MINERAL\",\"REFRESH_ACID\"]",
          "compliant": true,
          "score": 0.25,
          "detail": {
            "predicted": [
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_SAVORY",
              "HARMONY_REGIONAL",
              "REFRESH_MINERAL",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "precision": 0.25,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1119,
            "output_tokens": 28,
            "latency_ms": 648.3,
            "usd": 6.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_ACIDITY_BALANCE\",\"HARMONY_REGIONAL\"]",
          "compliant": true,
          "score": 0.2857,
          "detail": {
            "predicted": [
              "HARMONY_REGIONAL",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_SAVORY",
              "HARMONY_REGIONAL",
              "REFRESH_MINERAL",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "precision": 0.3333,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1119,
            "output_tokens": 25,
            "latency_ms": 751.2,
            "usd": 6.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_TANNIN_PROTEIN\",\"REFRESH_ACID\",\"REFRESH_MINERAL\"]",
          "compliant": true,
          "score": 0.2857,
          "detail": {
            "predicted": [
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_SAVORY",
              "HARMONY_REGIONAL",
              "REFRESH_MINERAL",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "precision": 0.3333,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1119,
            "output_tokens": 22,
            "latency_ms": 900.3,
            "usd": 6.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.2738,
      "score_summary": {
        "mean": 0.2738,
        "stdev": 0.0206,
        "sem": 0.0119,
        "ci95_lo": 0.2505,
        "ci95_hi": 0.2971,
        "n": 3
      },
      "score_if_valid": 0.2738,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1119.0,
        "output_tokens_mean": 25.0,
        "latency_ms_mean": 766.6,
        "usd_sum": 0.000198
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-014",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "korean",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "f9be4ef2853d67b3",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"bulgogi: soy-pear-garlic marinated grilled beef\",\n  \"beverage\": \"medium-bodied unoaked Merlot\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_TANNIN_PROTEIN",
          "STRUCT_SWEET_OFFSET",
          "AROMA_SHARED_FRUIT",
          "STRUCT_BODY_ALIGN"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_BODY_ALIGN\",\"STRUCT_ACIDITY_BALANCE\",\"STRUCT_UMAMI_ENHANCE\"]",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": [
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_PROTEIN",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "expected": [
              "AROMA_SHARED_FRUIT",
              "STRUCT_BODY_ALIGN",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.5,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1119,
            "output_tokens": 31,
            "latency_ms": 743.6,
            "usd": 6.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"STRUCT_INTENSITY_MATCH\",\"STRUCT_BODY_ALIGN\",\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_UMAMI_ENHANCE\"]",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": [
              "STRUCT_BODY_ALIGN",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_TANNIN_PROTEIN",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "expected": [
              "AROMA_SHARED_FRUIT",
              "STRUCT_BODY_ALIGN",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.5,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1119,
            "output_tokens": 30,
            "latency_ms": 703.0,
            "usd": 6.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_BODY_ALIGN\",\"STRUCT_INTENSITY_MATCH\",\"STRUCT_ACIDITY_BALANCE\"]",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": [
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_FRUIT",
              "STRUCT_BODY_ALIGN",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.5,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1119,
            "output_tokens": 29,
            "latency_ms": 612.1,
            "usd": 6.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.5,
      "score_summary": {
        "mean": 0.5,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.5,
        "ci95_hi": 0.5,
        "n": 3
      },
      "score_if_valid": 0.5,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1119.0,
        "output_tokens_mean": 30.0,
        "latency_ms_mean": 686.233,
        "usd_sum": 0.000204
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-015",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "easy",
      "cuisine_region": "british",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "5474f731e5d9751c",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"Stilton blue cheese\",\n  \"beverage\": \"Vintage Port\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "CONTRAST_SWEET_SALTY",
          "STRUCT_INTENSITY_MATCH",
          "HARMONY_COMPLEXITY_MATCH",
          "STRUCT_SWEET_OFFSET"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_TANNIN_FAT\",\"STRUCT_SWEET_OFFSET\",\"AROMA_SHARED_EARTH\",\"CONTRAST_SWEET_ACID\"]",
          "compliant": true,
          "score": 0.25,
          "detail": {
            "predicted": [
              "AROMA_SHARED_EARTH",
              "CONTRAST_SWEET_ACID",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "CONTRAST_SWEET_SALTY",
              "HARMONY_COMPLEXITY_MATCH",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.25,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1106,
            "output_tokens": 33,
            "latency_ms": 767.1,
            "usd": 6.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"STRUCT_TANNIN_FAT\",\"STRUCT_UMAMI_ENHANCE\",\"STRUCT_SWEET_OFFSET\",\"HARMONY_FLAVOR_CONVERGE\"]",
          "compliant": true,
          "score": 0.25,
          "detail": {
            "predicted": [
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_FAT",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "expected": [
              "CONTRAST_SWEET_SALTY",
              "HARMONY_COMPLEXITY_MATCH",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.25,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1106,
            "output_tokens": 34,
            "latency_ms": 747.0,
            "usd": 6.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_TANNIN_FAT\",\"STRUCT_UMAMI_ENHANCE\",\"STRUCT_SWEET_OFFSET\"]",
          "compliant": true,
          "score": 0.2857,
          "detail": {
            "predicted": [
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_FAT",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "expected": [
              "CONTRAST_SWEET_SALTY",
              "HARMONY_COMPLEXITY_MATCH",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.3333,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1106,
            "output_tokens": 25,
            "latency_ms": 889.5,
            "usd": 6.5e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.2619,
      "score_summary": {
        "mean": 0.2619,
        "stdev": 0.0206,
        "sem": 0.0119,
        "ci95_lo": 0.2386,
        "ci95_hi": 0.2852,
        "n": 3
      },
      "score_if_valid": 0.2619,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1106.0,
        "output_tokens_mean": 30.667,
        "latency_ms_mean": 801.2,
        "usd_sum": 0.000202
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-016",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "hard",
      "cuisine_region": "japanese",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "fc7f227aebed6756",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"shrimp tempura, light fry\",\n  \"beverage\": \"chilled junmai ginjo sake\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_UMAMI_ENHANCE",
          "REFRESH_MINERAL",
          "AROMA_SHARED_FLORAL",
          "STRUCT_INTENSITY_MATCH",
          "HARMONY_REGIONAL"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"HARMONY_FLAVOR_CONVERGE\",\"CONTRAST_RICH_CRISP\",\"STRUCT_INTENSITY_MATCH\"]",
          "compliant": true,
          "score": 0.25,
          "detail": {
            "predicted": [
              "CONTRAST_RICH_CRISP",
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_INTENSITY_MATCH"
            ],
            "expected": [
              "AROMA_SHARED_FLORAL",
              "HARMONY_REGIONAL",
              "REFRESH_MINERAL",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "precision": 0.3333,
            "recall": 0.2
          },
          "metrics": {
            "input_tokens": 1113,
            "output_tokens": 27,
            "latency_ms": 828.5,
            "usd": 6.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"REFRESH_ACID\",\"STRUCT_BODY_ALIGN\",\"STRUCT_INTENSITY_MATCH\",\"HARMONY_FLAVOR_CONVERGE\"]",
          "compliant": true,
          "score": 0.2222,
          "detail": {
            "predicted": [
              "HARMONY_FLAVOR_CONVERGE",
              "REFRESH_ACID",
              "STRUCT_BODY_ALIGN",
              "STRUCT_INTENSITY_MATCH"
            ],
            "expected": [
              "AROMA_SHARED_FLORAL",
              "HARMONY_REGIONAL",
              "REFRESH_MINERAL",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "precision": 0.25,
            "recall": 0.2
          },
          "metrics": {
            "input_tokens": 1113,
            "output_tokens": 28,
            "latency_ms": 683.7,
            "usd": 6.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_BODY_ALIGN\",\"STRUCT_ACIDITY_BALANCE\",\"REFRESH_ACID\",\"HARMONY_FLAVOR_CONVERGE\"]",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": [
              "HARMONY_FLAVOR_CONVERGE",
              "REFRESH_ACID",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_BODY_ALIGN"
            ],
            "expected": [
              "AROMA_SHARED_FLORAL",
              "HARMONY_REGIONAL",
              "REFRESH_MINERAL",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "precision": 0.0,
            "recall": 0.0
          },
          "metrics": {
            "input_tokens": 1113,
            "output_tokens": 29,
            "latency_ms": 1042.5,
            "usd": 6.7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.1574,
      "score_summary": {
        "mean": 0.1574,
        "stdev": 0.137,
        "sem": 0.0791,
        "ci95_lo": 0.0023,
        "ci95_hi": 0.3125,
        "n": 3
      },
      "score_if_valid": 0.1574,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1113.0,
        "output_tokens_mean": 28.0,
        "latency_ms_mean": 851.567,
        "usd_sum": 0.0002
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-017",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "greek",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "024198497e327bb1",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"horiatiki salad with feta, tomato, olive, oregano\",\n  \"beverage\": \"Santorini Assyrtiko\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "REFRESH_ACID",
          "REFRESH_MINERAL",
          "HARMONY_REGIONAL",
          "AROMA_SHARED_HERBAL"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"aromatic_bridge\",\"positive_structural\",\"refresh\"]",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": [
              "aromatic_bridge",
              "positive_structural",
              "refresh"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL"
            ],
            "precision": 0.0,
            "recall": 0.0
          },
          "metrics": {
            "input_tokens": 1118,
            "output_tokens": 14,
            "latency_ms": 666.6,
            "usd": 6.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"AROMA_SHARED_HERBAL\",\"AROMA_SHARED_EARTH\",\"STRUCT_ACIDITY_BALANCE\"]",
          "compliant": true,
          "score": 0.2857,
          "detail": {
            "predicted": [
              "AROMA_SHARED_EARTH",
              "AROMA_SHARED_HERBAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL"
            ],
            "precision": 0.3333,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1118,
            "output_tokens": 25,
            "latency_ms": 656.9,
            "usd": 6.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"aromatic_bridge\",\"HARMONY_FLAVOR_CONVERGE\",\"STRUCT_ACIDITY_BALANCE\",\"REFRESH_ACID\"]",
          "compliant": true,
          "score": 0.25,
          "detail": {
            "predicted": [
              "HARMONY_FLAVOR_CONVERGE",
              "REFRESH_ACID",
              "STRUCT_ACIDITY_BALANCE",
              "aromatic_bridge"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL"
            ],
            "precision": 0.25,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1118,
            "output_tokens": 29,
            "latency_ms": 788.0,
            "usd": 6.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.1786,
      "score_summary": {
        "mean": 0.1786,
        "stdev": 0.1557,
        "sem": 0.0899,
        "ci95_lo": 0.0024,
        "ci95_hi": 0.3547,
        "n": 3
      },
      "score_if_valid": 0.1786,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1118.0,
        "output_tokens_mean": 22.667,
        "latency_ms_mean": 703.833,
        "usd_sum": 0.000196
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-018",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "hard",
      "cuisine_region": "chinese",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "283bbb9e703e6e30",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"Peking duck with pancake, scallion, hoisin\",\n  \"beverage\": \"10-year-old off-dry Mosel Riesling\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_SWEET_OFFSET",
          "AROMA_SHARED_FRUIT",
          "REFRESH_ACID",
          "HARMONY_COMPLEXITY_MATCH"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_ACIDITY_BALANCE\",\"STRUCT_TANNIN_FAT\",\"STRUCT_SWEET_OFFSET\",\"AROMA_SHARED_SPICE\",\"HARMONY_FLAVOR_CONVERGE\"]",
          "compliant": true,
          "score": 0.2222,
          "detail": {
            "predicted": [
              "AROMA_SHARED_SPICE",
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "AROMA_SHARED_FRUIT",
              "HARMONY_COMPLEXITY_MATCH",
              "REFRESH_ACID",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.2,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1120,
            "output_tokens": 39,
            "latency_ms": 1061.9,
            "usd": 7.2e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"STRUCT_SWEET_OFFSET\",\"STRUCT_ACIDITY_BALANCE\",\"AROMA_SHARED_HERBAL\",\"HARMONY_FLAVOR_CONVERGE\"]",
          "compliant": true,
          "score": 0.25,
          "detail": {
            "predicted": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "expected": [
              "AROMA_SHARED_FRUIT",
              "HARMONY_COMPLEXITY_MATCH",
              "REFRESH_ACID",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.25,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1120,
            "output_tokens": 33,
            "latency_ms": 649.7,
            "usd": 6.9e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_SWEET_OFFSET\",\"STRUCT_ACIDITY_BALANCE\",\"STRUCT_UMAMI_ENHANCE\"]",
          "compliant": true,
          "score": 0.2857,
          "detail": {
            "predicted": [
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "expected": [
              "AROMA_SHARED_FRUIT",
              "HARMONY_COMPLEXITY_MATCH",
              "REFRESH_ACID",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.3333,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1120,
            "output_tokens": 25,
            "latency_ms": 857.0,
            "usd": 6.6e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.2526,
      "score_summary": {
        "mean": 0.2526,
        "stdev": 0.0318,
        "sem": 0.0184,
        "ci95_lo": 0.2166,
        "ci95_hi": 0.2887,
        "n": 3
      },
      "score_if_valid": 0.2526,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1120.0,
        "output_tokens_mean": 32.333,
        "latency_ms_mean": 856.2,
        "usd_sum": 0.000207
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-019",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "9ea6cd79384f9ee2",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"Sunday ragù napoletano (long-cooked beef and pork)\",\n  \"beverage\": \"Campanian Aglianico, 5 years\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_TANNIN_PROTEIN",
          "HARMONY_REGIONAL",
          "STRUCT_BODY_ALIGN",
          "AROMA_SHARED_WOOD"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_TANNIN_FAT\",\"STRUCT_ALCOHOL_WARMTH\",\"HARMONY_REGIONAL\",\"REFRESH_ACID\"]",
          "compliant": true,
          "score": 0.25,
          "detail": {
            "predicted": [
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "STRUCT_ALCOHOL_WARMTH",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "AROMA_SHARED_WOOD",
              "HARMONY_REGIONAL",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.25,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1123,
            "output_tokens": 30,
            "latency_ms": 720.2,
            "usd": 6.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"HARMONY_FLAVOR_CONVERGE\",\"STRUCT_TANNIN_FAT\",\"STRUCT_UMAMI_ENHANCE\",\"STRUCT_ALCOHOL_WARMTH\",\"REFRESH_ACID\"]",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": [
              "HARMONY_FLAVOR_CONVERGE",
              "REFRESH_ACID",
              "STRUCT_ALCOHOL_WARMTH",
              "STRUCT_TANNIN_FAT",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "expected": [
              "AROMA_SHARED_WOOD",
              "HARMONY_REGIONAL",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.0,
            "recall": 0.0
          },
          "metrics": {
            "input_tokens": 1123,
            "output_tokens": 41,
            "latency_ms": 721.6,
            "usd": 7.3e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"HARMONY_REGIONAL\",\"STRUCT_INTENSITY_MATCH\",\"STRUCT_BODY_ALIGN\",\"STRUCT_TANNIN_FAT\",\"STRUCT_ALCOHOL_WARMTH\"]",
          "compliant": true,
          "score": 0.4444,
          "detail": {
            "predicted": [
              "HARMONY_REGIONAL",
              "STRUCT_ALCOHOL_WARMTH",
              "STRUCT_BODY_ALIGN",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "AROMA_SHARED_WOOD",
              "HARMONY_REGIONAL",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.4,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1123,
            "output_tokens": 35,
            "latency_ms": 881.5,
            "usd": 7e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.2315,
      "score_summary": {
        "mean": 0.2315,
        "stdev": 0.2228,
        "sem": 0.1286,
        "ci95_lo": -0.0206,
        "ci95_hi": 0.4836,
        "n": 3
      },
      "score_if_valid": 0.2315,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1123.0,
        "output_tokens_mean": 35.333,
        "latency_ms_mean": 774.433,
        "usd_sum": 0.000211
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-020",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "easy",
      "cuisine_region": "french",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "43005f44c870684b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"foie gras torchon with brioche\",\n  \"beverage\": \"Sauternes\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_SWEET_OFFSET",
          "CONTRAST_SWEET_SALTY",
          "HARMONY_REGIONAL",
          "STRUCT_BODY_ALIGN"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_SWEET_OFFSET\",\"STRUCT_ACIDITY_BALANCE\",\"STRUCT_UMAMI_ENHANCE\",\"STRUCT_TANNIN_FAT\"]",
          "compliant": true,
          "score": 0.25,
          "detail": {
            "predicted": [
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_FAT",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "expected": [
              "CONTRAST_SWEET_SALTY",
              "HARMONY_REGIONAL",
              "STRUCT_BODY_ALIGN",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.25,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1110,
            "output_tokens": 32,
            "latency_ms": 588.8,
            "usd": 6.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"STRUCT_TANNIN_FAT\",\"STRUCT_SWEET_OFFSET\",\"REFRESH_ACID\"]",
          "compliant": true,
          "score": 0.2857,
          "detail": {
            "predicted": [
              "REFRESH_ACID",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "CONTRAST_SWEET_SALTY",
              "HARMONY_REGIONAL",
              "STRUCT_BODY_ALIGN",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.3333,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1110,
            "output_tokens": 22,
            "latency_ms": 718.0,
            "usd": 6.4e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_ACIDITY_BALANCE\",\"STRUCT_TANNIN_FAT\",\"STRUCT_UMAMI_ENHANCE\",\"STRUCT_BODY_ALIGN\"]",
          "compliant": true,
          "score": 0.25,
          "detail": {
            "predicted": [
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_FAT",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "expected": [
              "CONTRAST_SWEET_SALTY",
              "HARMONY_REGIONAL",
              "STRUCT_BODY_ALIGN",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.25,
            "recall": 0.25
          },
          "metrics": {
            "input_tokens": 1110,
            "output_tokens": 30,
            "latency_ms": 801.3,
            "usd": 6.8e-05,
            "model_version": "gpt-5.4-nano-2026-03-17",
            "finish_reason": "stop"
          }
        }
      ],
      "score": 0.2619,
      "score_summary": {
        "mean": 0.2619,
        "stdev": 0.0206,
        "sem": 0.0119,
        "ci95_lo": 0.2386,
        "ci95_hi": 0.2852,
        "n": 3
      },
      "score_if_valid": 0.2619,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1110.0,
        "output_tokens_mean": 28.0,
        "latency_ms_mean": 702.7,
        "usd_sum": 0.0002
      }
    }
  ],
  "category_scores": {
    "composition": {
      "mean_score": 0.7756,
      "ci95_lo": 0.598,
      "ci95_hi": 0.9532,
      "stdev": 0.4052,
      "task_count": 20,
      "weight": 0.12,
      "format_rate": 1.0,
      "score_if_valid": 0.7756
    },
    "dietary": {
      "mean_score": 0.6205,
      "ci95_lo": 0.3897,
      "ci95_hi": 0.8513,
      "stdev": 0.4079,
      "task_count": 12,
      "weight": 0.06,
      "format_rate": 1.0,
      "score_if_valid": 0.6205
    },
    "explanation": {
      "mean_score": 0.3592,
      "ci95_lo": 0.3037,
      "ci95_hi": 0.4147,
      "stdev": 0.1267,
      "task_count": 20,
      "weight": 0.1,
      "format_rate": 1.0,
      "score_if_valid": 0.3592
    },
    "menu_engineering": {
      "mean_score": 0.875,
      "ci95_lo": 0.63,
      "ci95_hi": 1.12,
      "stdev": 0.3536,
      "task_count": 8,
      "weight": 0.03,
      "format_rate": 1.0,
      "score_if_valid": 0.875
    },
    "nutrition": {
      "mean_score": 0.3089,
      "ci95_lo": 0.0218,
      "ci95_hi": 0.5959,
      "stdev": 0.4631,
      "task_count": 10,
      "weight": 0.03,
      "format_rate": 1.0,
      "score_if_valid": 0.3089
    },
    "pairing": {
      "mean_score": 0.9159,
      "ci95_lo": 0.8141,
      "ci95_hi": 1.0177,
      "stdev": 0.2436,
      "task_count": 22,
      "weight": 0.15,
      "format_rate": 1.0,
      "score_if_valid": 0.9159
    },
    "regional": {
      "mean_score": 0.9167,
      "ci95_lo": 0.8184,
      "ci95_hi": 1.015,
      "stdev": 0.2457,
      "task_count": 24,
      "weight": 0.05,
      "format_rate": 1.0,
      "score_if_valid": 0.9167
    },
    "risk": {
      "mean_score": 0.7667,
      "ci95_lo": 0.6087,
      "ci95_hi": 0.9246,
      "stdev": 0.3603,
      "task_count": 20,
      "weight": 0.08,
      "format_rate": 1.0,
      "score_if_valid": 0.7667
    },
    "safety": {
      "mean_score": 0.8991,
      "ci95_lo": 0.7353,
      "ci95_hi": 1.063,
      "stdev": 0.2895,
      "task_count": 12,
      "weight": 0.06,
      "format_rate": 1.0,
      "score_if_valid": 0.8991
    },
    "sensory": {
      "mean_score": 0.4361,
      "ci95_lo": 0.2907,
      "ci95_hi": 0.5815,
      "stdev": 0.34,
      "task_count": 21,
      "weight": 0.12,
      "format_rate": 1.0,
      "score_if_valid": 0.4361
    },
    "structured_output": {
      "mean_score": 0.8233,
      "ci95_lo": 0.7187,
      "ci95_hi": 0.9279,
      "stdev": 0.2387,
      "task_count": 20,
      "weight": 0.12,
      "format_rate": 1.0,
      "score_if_valid": 0.8233
    },
    "vocabulary": {
      "mean_score": 0.7681,
      "ci95_lo": 0.6016,
      "ci95_hi": 0.9346,
      "stdev": 0.4074,
      "task_count": 23,
      "weight": 0.08,
      "format_rate": 1.0,
      "score_if_valid": 0.7681
    }
  },
  "aggregate": {
    "score": 71.28,
    "ci95_lo": 66.86,
    "ci95_hi": 75.71,
    "scale": "0-100",
    "partial": false,
    "missing_categories": []
  },
  "metrics": {
    "latency_ms_mean": 1124.439,
    "latency_ms_median": 652.8,
    "input_tokens_mean": 728.8,
    "output_tokens_mean": 79.8,
    "usd_total": 0.043676,
    "tasks_with_usd": 213
  },
  "weights": {
    "vocabulary": 0.08,
    "sensory": 0.12,
    "composition": 0.12,
    "pairing": 0.15,
    "explanation": 0.1,
    "risk": 0.08,
    "structured_output": 0.12,
    "regional": 0.05,
    "dietary": 0.06,
    "safety": 0.06,
    "nutrition": 0.03,
    "menu_engineering": 0.03
  },
  "judge": {
    "judge_model": "gpt-5.4",
    "judge_prompt_hash": "756db6a487714bca",
    "judge_validated": false,
    "excluded_task_ids": [
      "ogs:core:benchmark-task:menu-009"
    ],
    "note": "Rubric-judge tasks are scored and retained in `tasks`, but excluded from `aggregate` until the judge clears the human-validation protocol (r > 0.8 against a 50-task sample)."
  }
}