{
  "benchmark_version": "0.3.0",
  "ogs_version": "0.2.0",
  "run_id": "5418315e-deca-460c-92ed-f0c24f5dac12",
  "run_at": "2026-04-18T06:44:29.876782+00:00",
  "model": {
    "id": "claude-sonnet-4-6",
    "adapter": "anthropic",
    "options": {}
  },
  "config": {
    "samples": 3,
    "closed_book": false,
    "seed": null,
    "skip_disputed": true
  },
  "environment": {
    "python": "3.14.3",
    "platform": "macOS-26.4.1-arm64-arm-64bit-Mach-O",
    "sdk_versions": {
      "openai": "2.32.0",
      "anthropic": "0.96.0",
      "jsonschema": "4.26.0"
    }
  },
  "reproducibility": {
    "task_set_sha": "345fbe2cedeac504",
    "task_count": 213,
    "prompt_hashes": {
      "ogs:core:benchmark-task:comp-001": "5dc05e883ee75cf7",
      "ogs:core:benchmark-task:comp-002": "d5fef8891eee5391",
      "ogs:core:benchmark-task:comp-003": "7b7db2c313e1d55b",
      "ogs:core:benchmark-task:comp-004": "ca8109a73af762fa",
      "ogs:core:benchmark-task:comp-005": "2885728399552fb5",
      "ogs:core:benchmark-task:comp-006": "bb37bf992b2f6c39",
      "ogs:core:benchmark-task:comp-007": "59ae41132f38bbfd",
      "ogs:core:benchmark-task:comp-008": "5121ce4bf46aa4d5",
      "ogs:core:benchmark-task:comp-009": "2a8a86bb58372ed7",
      "ogs:core:benchmark-task:comp-010": "d420afcaea603a0e",
      "ogs:core:benchmark-task:comp-011": "1d18b06eac9f3ddc",
      "ogs:core:benchmark-task:comp-012": "fa91f74e86a4cffb",
      "ogs:core:benchmark-task:comp-013": "f8257062be99b5d3",
      "ogs:core:benchmark-task:comp-014": "114b0796c10cf141",
      "ogs:core:benchmark-task:comp-015": "07976f0ee97244e5",
      "ogs:core:benchmark-task:comp-016": "609a0643399a8df0",
      "ogs:core:benchmark-task:comp-017": "09a6e47eda264ded",
      "ogs:core:benchmark-task:comp-018": "32d4a921f4fbdd98",
      "ogs:core:benchmark-task:comp-019": "eaa27377a264e2e2",
      "ogs:core:benchmark-task:comp-020": "535bf59391539a15",
      "ogs:core:benchmark-task:diet-001": "18b5c9c7c958a521",
      "ogs:core:benchmark-task:diet-002": "04840e46f80ccb5b",
      "ogs:core:benchmark-task:diet-003": "013d6295fee472b5",
      "ogs:core:benchmark-task:diet-004": "85fc94958c4357ef",
      "ogs:core:benchmark-task:diet-005": "f40f4c25a2ee4b71",
      "ogs:core:benchmark-task:diet-006": "7f45e388301a68a5",
      "ogs:core:benchmark-task:diet-007": "eb092a2c162960f6",
      "ogs:core:benchmark-task:diet-008": "480be99b3f35a51d",
      "ogs:core:benchmark-task:diet-009": "936752891554dba3",
      "ogs:core:benchmark-task:diet-010": "35c29846f76046d7",
      "ogs:core:benchmark-task:diet-011": "bdd7970997c11172",
      "ogs:core:benchmark-task:diet-012": "42c897d741064981",
      "ogs:core:benchmark-task:menu-001": "138c09ed611a2d43",
      "ogs:core:benchmark-task:menu-002": "b5df321fd829de76",
      "ogs:core:benchmark-task:menu-003": "9795b72bdda437f1",
      "ogs:core:benchmark-task:menu-004": "fc0da6dcbbc842fb",
      "ogs:core:benchmark-task:menu-005": "f33710f88f89121e",
      "ogs:core:benchmark-task:menu-006": "7bb019ca9ca4b1be",
      "ogs:core:benchmark-task:menu-007": "68b7fb2ec1e432f4",
      "ogs:core:benchmark-task:menu-008": "dc5bf7ed3a0583c2",
      "ogs:core:benchmark-task:menu-009": "8883cacb0aca5024",
      "ogs:core:benchmark-task:ml-ar-001": "94e19d81d80368a1",
      "ogs:core:benchmark-task:ml-ar-002": "6b39ec376be689ad",
      "ogs:core:benchmark-task:ml-es-001": "5fe5c7f1f85e3675",
      "ogs:core:benchmark-task:ml-es-002": "9f03b90711bec40f",
      "ogs:core:benchmark-task:ml-fr-001": "8d252ae3fb170e15",
      "ogs:core:benchmark-task:ml-fr-002": "96a07aeab44536ab",
      "ogs:core:benchmark-task:ml-it-001": "c3aae88ebfcbe08b",
      "ogs:core:benchmark-task:ml-it-002": "30919f613b1fbcd6",
      "ogs:core:benchmark-task:ml-ja-001": "ff1f43723314705c",
      "ogs:core:benchmark-task:ml-ja-002": "f68d4882814e0ecf",
      "ogs:core:benchmark-task:nut-001": "94fadc6f5c8c5968",
      "ogs:core:benchmark-task:nut-002": "4b141daaadd9af57",
      "ogs:core:benchmark-task:nut-003": "518ee70ef096f9ec",
      "ogs:core:benchmark-task:nut-004": "42947503c82b308d",
      "ogs:core:benchmark-task:nut-005": "375ca5d44d346632",
      "ogs:core:benchmark-task:nut-006": "1ba6175ed99ad486",
      "ogs:core:benchmark-task:nut-007": "b9e5625ed0828c94",
      "ogs:core:benchmark-task:nut-008": "95ef839d58e96bbc",
      "ogs:core:benchmark-task:nut-009": "641416c8b33c6dbc",
      "ogs:core:benchmark-task:nut-010": "fd7edc20921469ff",
      "ogs:core:benchmark-task:pair-001": "3fa6a67d5bfe7f1a",
      "ogs:core:benchmark-task:pair-002": "4b85899cd6206acb",
      "ogs:core:benchmark-task:pair-003": "41ed3ee840823502",
      "ogs:core:benchmark-task:pair-004": "82a4b41d0e3d5668",
      "ogs:core:benchmark-task:pair-005": "441705237fe65e54",
      "ogs:core:benchmark-task:pair-006": "b195ee1dfe7968ab",
      "ogs:core:benchmark-task:pair-007": "81bf14f26f4fa761",
      "ogs:core:benchmark-task:pair-008": "51ad14586a063779",
      "ogs:core:benchmark-task:pair-009": "d7ade64ce48bfbe4",
      "ogs:core:benchmark-task:pair-010": "637717fdf3577204",
      "ogs:core:benchmark-task:pair-011": "2b655f8b1ea64df8",
      "ogs:core:benchmark-task:pair-012": "3fe344c62017694c",
      "ogs:core:benchmark-task:pair-013": "4d30c6e439c9c85f",
      "ogs:core:benchmark-task:pair-014": "848f37f91777acfb",
      "ogs:core:benchmark-task:pair-015": "f5ecf9dcd59f0d0a",
      "ogs:core:benchmark-task:pair-016": "3dd7c0170877ac9b",
      "ogs:core:benchmark-task:pair-017": "8f60336584f33baf",
      "ogs:core:benchmark-task:pair-018": "9209873bc05779b7",
      "ogs:core:benchmark-task:pair-019": "da735293a9d3102b",
      "ogs:core:benchmark-task:pair-020": "bac251ebb1c064ef",
      "ogs:core:benchmark-task:region-001": "1df867dd3a55af5b",
      "ogs:core:benchmark-task:region-002": "24a95f8a1c0a0a45",
      "ogs:core:benchmark-task:region-003": "15d36148f8b853e6",
      "ogs:core:benchmark-task:region-004": "e332af036a2bd591",
      "ogs:core:benchmark-task:region-005": "4aa2c63ffdeeacfb",
      "ogs:core:benchmark-task:region-006": "979e7579b34d9797",
      "ogs:core:benchmark-task:region-007": "23a69016b1082553",
      "ogs:core:benchmark-task:region-008": "0a12eb057eb17654",
      "ogs:core:benchmark-task:region-009": "767d178c3cce3333",
      "ogs:core:benchmark-task:region-010": "1001169bc92aaacd",
      "ogs:core:benchmark-task:region-011": "571e5074e66e6a11",
      "ogs:core:benchmark-task:region-012": "d9792be4d5ca7335",
      "ogs:core:benchmark-task:region-013": "c831a091be5025f9",
      "ogs:core:benchmark-task:region-014": "7d761cbfb86342af",
      "ogs:core:benchmark-task:region-015": "2f35d24bb9d58118",
      "ogs:core:benchmark-task:region-016": "efb489c56faf463a",
      "ogs:core:benchmark-task:region-017": "53530e4634162afc",
      "ogs:core:benchmark-task:region-018": "28a636bdc82633de",
      "ogs:core:benchmark-task:region-019": "2c5869ce479f65cc",
      "ogs:core:benchmark-task:region-020": "aa3761bdbf2d490e",
      "ogs:core:benchmark-task:risk-001": "08845fcf7f8beb09",
      "ogs:core:benchmark-task:risk-002": "7a188b424515258f",
      "ogs:core:benchmark-task:risk-003": "a60fa6dd5c1ff452",
      "ogs:core:benchmark-task:risk-004": "16286de7fea14ae1",
      "ogs:core:benchmark-task:risk-005": "cd44f8ac361108e8",
      "ogs:core:benchmark-task:risk-006": "28cb271326a709c7",
      "ogs:core:benchmark-task:risk-007": "93c46eac23eca249",
      "ogs:core:benchmark-task:risk-008": "2409e606629eebf9",
      "ogs:core:benchmark-task:risk-009": "a00498fd7fce7cb7",
      "ogs:core:benchmark-task:risk-010": "b28af5fc40407ce0",
      "ogs:core:benchmark-task:risk-011": "fba6f85afa81595f",
      "ogs:core:benchmark-task:risk-012": "2f3a4360a39afdc9",
      "ogs:core:benchmark-task:risk-013": "390e90cae060ada8",
      "ogs:core:benchmark-task:risk-014": "d83b8b2ecabe9907",
      "ogs:core:benchmark-task:risk-015": "1e83097d483e0639",
      "ogs:core:benchmark-task:risk-016": "b2e02272ca87b2d4",
      "ogs:core:benchmark-task:risk-017": "12372c8de13add55",
      "ogs:core:benchmark-task:risk-018": "7a97c002792dfad9",
      "ogs:core:benchmark-task:risk-019": "0e3d6c3f9d86af0b",
      "ogs:core:benchmark-task:risk-020": "2dd649d25c606ee7",
      "ogs:core:benchmark-task:safe-001": "8e37b185cfaed2aa",
      "ogs:core:benchmark-task:safe-002": "d38d6470a18f7e97",
      "ogs:core:benchmark-task:safe-003": "97f1d8e6fc44b9e2",
      "ogs:core:benchmark-task:safe-004": "9e5770b6c7036769",
      "ogs:core:benchmark-task:safe-005": "cb2ff73059921b59",
      "ogs:core:benchmark-task:safe-006": "b3c9f3a848d65390",
      "ogs:core:benchmark-task:safe-007": "f120f4a38737e981",
      "ogs:core:benchmark-task:safe-008": "9b8f975bd042c7a0",
      "ogs:core:benchmark-task:safe-009": "9f4b49c1c20e4376",
      "ogs:core:benchmark-task:safe-010": "eea6814938f18565",
      "ogs:core:benchmark-task:safe-011": "edc2981cec11c39e",
      "ogs:core:benchmark-task:safe-012": "2ba4113ffe26b2fc",
      "ogs:core:benchmark-task:sense-001": "913118356da8150b",
      "ogs:core:benchmark-task:sense-002": "2690028d89b60292",
      "ogs:core:benchmark-task:sense-003": "b09502eb4fd5d61f",
      "ogs:core:benchmark-task:sense-004": "94f009713056d52f",
      "ogs:core:benchmark-task:sense-005": "c950ec9851ad52bf",
      "ogs:core:benchmark-task:sense-006": "d68ef8562addd55d",
      "ogs:core:benchmark-task:sense-007": "658c14cce434cc0a",
      "ogs:core:benchmark-task:sense-008": "873c6c2f9608e43f",
      "ogs:core:benchmark-task:sense-009": "523e04f39252cb4b",
      "ogs:core:benchmark-task:sense-010": "1813cae53ba17a84",
      "ogs:core:benchmark-task:sense-011": "1886b2e009716d1d",
      "ogs:core:benchmark-task:sense-012": "8d40a83f1b501685",
      "ogs:core:benchmark-task:sense-013": "691f59088f3e98f3",
      "ogs:core:benchmark-task:sense-014": "c4a83a79265c0e9f",
      "ogs:core:benchmark-task:sense-015": "66d64ca2254c09ee",
      "ogs:core:benchmark-task:sense-016": "70318c6fd3b033d2",
      "ogs:core:benchmark-task:sense-017": "c3cedde89152ce76",
      "ogs:core:benchmark-task:sense-018": "7d31de11173a7fd2",
      "ogs:core:benchmark-task:sense-019": "2e7df899a0148ebd",
      "ogs:core:benchmark-task:sense-020": "ca5b9ab85fe71acb",
      "ogs:core:benchmark-task:struct-001": "4523d68b7da194f7",
      "ogs:core:benchmark-task:struct-002": "367b58556573440d",
      "ogs:core:benchmark-task:struct-003": "2740dc70d150e12a",
      "ogs:core:benchmark-task:struct-004": "9bd6860a8c5c69ec",
      "ogs:core:benchmark-task:struct-005": "9caf9bc474b2464d",
      "ogs:core:benchmark-task:struct-006": "bb4d0fcee2d1e751",
      "ogs:core:benchmark-task:struct-007": "0bc2e49ee6c1093e",
      "ogs:core:benchmark-task:struct-008": "791ac1beebd95233",
      "ogs:core:benchmark-task:struct-009": "a6a51cef64e43626",
      "ogs:core:benchmark-task:struct-010": "6d4d6d3cc23f3283",
      "ogs:core:benchmark-task:struct-011": "6615664480900680",
      "ogs:core:benchmark-task:struct-012": "bd330891a3e68b59",
      "ogs:core:benchmark-task:struct-013": "872d23f39307a374",
      "ogs:core:benchmark-task:struct-014": "3fe3fe74f8d80260",
      "ogs:core:benchmark-task:struct-015": "f0c01fa20f35eb51",
      "ogs:core:benchmark-task:struct-016": "284b25ae8f0dc3ef",
      "ogs:core:benchmark-task:struct-017": "534d8590759d6dc3",
      "ogs:core:benchmark-task:struct-018": "1470a28b8e9ef028",
      "ogs:core:benchmark-task:struct-019": "aa2726470aa7cfb8",
      "ogs:core:benchmark-task:struct-020": "5b115600e8dad71f",
      "ogs:core:benchmark-task:vocab-001": "e52270d817a9dc7f",
      "ogs:core:benchmark-task:vocab-002": "e2096242953ff882",
      "ogs:core:benchmark-task:vocab-003": "fa5fa32bfe275dfb",
      "ogs:core:benchmark-task:vocab-004": "e0631034b8b8ece4",
      "ogs:core:benchmark-task:vocab-005": "bbbd81bb3c8b8ea4",
      "ogs:core:benchmark-task:vocab-006": "6c6ae6f82f354be3",
      "ogs:core:benchmark-task:vocab-007": "0adac8c06022acf5",
      "ogs:core:benchmark-task:vocab-008": "e58319dc1276cdb2",
      "ogs:core:benchmark-task:vocab-009": "d51864ab27ecfa64",
      "ogs:core:benchmark-task:vocab-010": "1748069ad0ebdde7",
      "ogs:core:benchmark-task:vocab-011": "85de6add5c1ebfba",
      "ogs:core:benchmark-task:vocab-012": "44c9b26dc70c4557",
      "ogs:core:benchmark-task:vocab-013": "dc3e62daf8bdf21a",
      "ogs:core:benchmark-task:vocab-014": "3980456a47e9b7c6",
      "ogs:core:benchmark-task:vocab-015": "ad313a98974d64f7",
      "ogs:core:benchmark-task:vocab-016": "95e41ab82cba029a",
      "ogs:core:benchmark-task:vocab-017": "8070fb687fd2f108",
      "ogs:core:benchmark-task:vocab-018": "9e97b59361dbdf0e",
      "ogs:core:benchmark-task:vocab-019": "fc9a763682f1225b",
      "ogs:core:benchmark-task:vocab-020": "56ad4f9c58bc1eb7",
      "ogs:core:benchmark-task:xplain-001": "a75ad0ee2a385794",
      "ogs:core:benchmark-task:xplain-002": "bdc5d8d74c85e9ab",
      "ogs:core:benchmark-task:xplain-003": "3440bcf3978a1c58",
      "ogs:core:benchmark-task:xplain-004": "b3249dafda966f70",
      "ogs:core:benchmark-task:xplain-005": "0cb2f1946943a0e2",
      "ogs:core:benchmark-task:xplain-006": "9a46b0d141442ce3",
      "ogs:core:benchmark-task:xplain-007": "497adc64f3303eb4",
      "ogs:core:benchmark-task:xplain-008": "2120f96a0444a50e",
      "ogs:core:benchmark-task:xplain-009": "c077c498ceed5f73",
      "ogs:core:benchmark-task:xplain-010": "956c6ce654c2e297",
      "ogs:core:benchmark-task:xplain-011": "1dfeb782059a4814",
      "ogs:core:benchmark-task:xplain-012": "28df36c8ae155697",
      "ogs:core:benchmark-task:xplain-013": "18a20e7e30844c41",
      "ogs:core:benchmark-task:xplain-014": "f9be4ef2853d67b3",
      "ogs:core:benchmark-task:xplain-015": "5474f731e5d9751c",
      "ogs:core:benchmark-task:xplain-016": "fc7f227aebed6756",
      "ogs:core:benchmark-task:xplain-017": "024198497e327bb1",
      "ogs:core:benchmark-task:xplain-018": "283bbb9e703e6e30",
      "ogs:core:benchmark-task:xplain-019": "9ea6cd79384f9ee2",
      "ogs:core:benchmark-task:xplain-020": "43005f44c870684b"
    }
  },
  "tasks": [
    {
      "task_id": "ogs:core:benchmark-task:comp-001",
      "category": "composition",
      "capability": "maillard_index",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.maillard_index"
      ],
      "prompt_template_hash": "5dc05e883ee75cf7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Maillard index (0-5 integer)\n\n- 0: no browning (raw, poached, steamed)\n- 1: pale gold (soft-baked dough, lightly sauteed)\n- 2: golden brown (roasted chicken skin)\n- 3: deep mahogany (well-seared steak crust, duck breast skin)\n- 4: very dark brown with bitter edge (heavy crust, blackened fish)\n- 5: fully charred (intentionally blackened, carbonised)\n\n---\n\nUsing the OGS Maillard index (0 = no browning, 5 = fully charred), return the integer value best describing the described cook. Respond with just the integer.\n\nINPUT:\n{\n  \"description\": \"a duck breast seared skin-side down until the skin is deep mahogany brown and crisp, but not black\"\n}\n\nANSWER FORMAT: integer_0_5",
      "expected": {
        "value": 3,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "3",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 3.0,
            "expected": 3.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 471,
            "output_tokens": 5,
            "latency_ms": 1286.7,
            "usd": 0.001488,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "3",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 3.0,
            "expected": 3.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 471,
            "output_tokens": 5,
            "latency_ms": 1247.8,
            "usd": 0.001488,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "3",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 3.0,
            "expected": 3.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 471,
            "output_tokens": 5,
            "latency_ms": 2410.6,
            "usd": 0.001488,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 471.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1648.367,
        "usd_sum": 0.004464
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-002",
      "category": "composition",
      "capability": "maillard_index",
      "difficulty": "easy",
      "cuisine_region": "french",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.maillard_index"
      ],
      "prompt_template_hash": "d5fef8891eee5391",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Maillard index (0-5 integer)\n\n- 0: no browning (raw, poached, steamed)\n- 1: pale gold (soft-baked dough, lightly sauteed)\n- 2: golden brown (roasted chicken skin)\n- 3: deep mahogany (well-seared steak crust, duck breast skin)\n- 4: very dark brown with bitter edge (heavy crust, blackened fish)\n- 5: fully charred (intentionally blackened, carbonised)\n\n---\n\nUsing the OGS Maillard index (0 = no browning, 5 = fully charred), return the integer value for the described cook. Respond with just the integer.\n\nINPUT:\n{\n  \"description\": \"a fresh egg poached in 82 C water for 4 minutes\"\n}\n\nANSWER FORMAT: integer_0_5",
      "expected": {
        "value": 0
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 0.0,
            "expected": 0.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 461,
            "output_tokens": 5,
            "latency_ms": 737.4,
            "usd": 0.001458,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 0.0,
            "expected": 0.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 461,
            "output_tokens": 5,
            "latency_ms": 1880.8,
            "usd": 0.001458,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 0.0,
            "expected": 0.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 461,
            "output_tokens": 5,
            "latency_ms": 1192.1,
            "usd": 0.001458,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 461.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1270.1,
        "usd_sum": 0.004374
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-003",
      "category": "composition",
      "capability": "reduction_level",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.reduction_level"
      ],
      "prompt_template_hash": "7b7db2c313e1d55b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Reduction level (0-5 integer)\n\n- 0: no reduction (raw stock, unreduced liquid)\n- 1: light simmer reduction (~10%)\n- 2: modest reduction (~30%, nappe forming)\n- 3: substantial reduction (~50%, demi-glace territory)\n- 4: heavy reduction (~75%, glace-like, syrupy)\n- 5: near-complete reduction (glace de viande, paste)\n\n---\n\nUsing the OGS reduction level (0 = no reduction, 5 = glace-level), return the integer best describing the sauce. Respond with just the integer.\n\nINPUT:\n{\n  \"description\": \"a veal demi-glace reduced from 4 L of stock down to about 600 mL, thickly coating a spoon with a glossy sheen\"\n}\n\nANSWER FORMAT: integer_0_5",
      "expected": {
        "value": 4,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "4",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 4.0,
            "expected": 4.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 461,
            "output_tokens": 5,
            "latency_ms": 793.6,
            "usd": 0.001458,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "4",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 4.0,
            "expected": 4.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 461,
            "output_tokens": 5,
            "latency_ms": 819.7,
            "usd": 0.001458,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "4",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 4.0,
            "expected": 4.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 461,
            "output_tokens": 5,
            "latency_ms": 886.8,
            "usd": 0.001458,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 461.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 833.367,
        "usd_sum": 0.004374
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-004",
      "category": "composition",
      "capability": "cooking_method_recognition",
      "difficulty": "hard",
      "cuisine_region": "japanese",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.cooking_methods"
      ],
      "prompt_template_hash": "ca8109a73af762fa",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n---\n\nA piece of black cod is packed in salt-sugar-sake-lees paste and kept refrigerated for 48 hours, then rinsed and grilled over binchotan. What is the single OGS cooking method that best labels the BEFORE-grilling stage? Respond with just the method name.\n\nINPUT:\n{\n  \"description\": \"black cod packed in salt-sugar-sake-lees paste for 48h before grilling\"\n}\n\nANSWER FORMAT: one_of:raw,seared,grilled,roasted,braised,poached,steamed,fried,confit,smoked,fermented,cured,pickled,dehydrated,sous_vide,blanched,sauteed,baked,charred,emulsified",
      "expected": {
        "value": "cured"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "cured",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cured",
            "expected": "cured",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 497,
            "output_tokens": 5,
            "latency_ms": 1259.6,
            "usd": 0.001566,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "cured",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cured",
            "expected": "cured",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 497,
            "output_tokens": 5,
            "latency_ms": 1020.7,
            "usd": 0.001566,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "cured",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cured",
            "expected": "cured",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 497,
            "output_tokens": 5,
            "latency_ms": 1042.5,
            "usd": 0.001566,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 497.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1107.6,
        "usd_sum": 0.004698
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-005",
      "category": "composition",
      "capability": "component_role_assignment",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.component_roles"
      ],
      "prompt_template_hash": "2885728399552fb5",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nFor the described dish element, return the single OGS component role that best applies. Respond with just the role name.\n\nINPUT:\n{\n  \"element\": \"a small handful of thin, deep-fried shallot rings scattered on top of a braised beef rendang primarily for crunchy contrast\"\n}\n\nANSWER FORMAT: one_of:main_protein,main_vegetable,starch,sauce,fat_component,acid_component,aromatic_garnish,seasoning,textural_element",
      "expected": {
        "value": "textural_element"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "textural_element",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "textural_element",
            "expected": "textural_element",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 408,
            "output_tokens": 7,
            "latency_ms": 1136.1,
            "usd": 0.001329,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "textural_element",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "textural_element",
            "expected": "textural_element",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 408,
            "output_tokens": 7,
            "latency_ms": 1481.5,
            "usd": 0.001329,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "textural_element",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "textural_element",
            "expected": "textural_element",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 408,
            "output_tokens": 7,
            "latency_ms": 971.3,
            "usd": 0.001329,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 408.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 1196.3,
        "usd_sum": 0.003987
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-006",
      "category": "composition",
      "capability": "component_set_identification",
      "difficulty": "hard",
      "cuisine_region": "italian",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.component_roles"
      ],
      "prompt_template_hash": "bb37bf992b2f6c39",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nList the OGS component ROLES that appear in a classic Roman spaghetti alla carbonara (guanciale, eggs, pecorino romano, spaghetti, black pepper). Respond with a JSON array of role strings from the OGS vocabulary, without duplicates, in any order.\n\nINPUT:\n{\n  \"dish\": \"classic Roman spaghetti alla carbonara\",\n  \"components_present\": [\n    \"spaghetti\",\n    \"guanciale\",\n    \"egg yolks\",\n    \"pecorino romano\",\n    \"black pepper\"\n  ]\n}\n\nANSWER FORMAT: json\n\nRESPONSE JSON SCHEMA:\n{\n  \"type\": \"array\",\n  \"items\": {\n    \"type\": \"string\",\n    \"enum\": [\n      \"main_protein\",\n      \"main_vegetable\",\n      \"starch\",\n      \"sauce\",\n      \"fat_component\",\n      \"acid_component\",\n      \"aromatic_garnish\",\n      \"seasoning\",\n      \"textural_element\"\n    ]\n  },\n  \"uniqueItems\": true\n}",
      "expected": {
        "values": [
          "starch",
          "main_protein",
          "fat_component",
          "sauce",
          "seasoning"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"starch\", \"main_protein\", \"sauce\", \"fat_component\", \"seasoning\", \"textural_element\"]",
          "compliant": true,
          "score": 0.9091,
          "detail": {
            "predicted": [
              "fat_component",
              "main_protein",
              "sauce",
              "seasoning",
              "starch",
              "textural_element"
            ],
            "expected": [
              "fat_component",
              "main_protein",
              "sauce",
              "seasoning",
              "starch"
            ],
            "precision": 0.8333,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 552,
            "output_tokens": 30,
            "latency_ms": 4643.6,
            "usd": 0.002106,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"starch\", \"main_protein\", \"sauce\", \"fat_component\", \"seasoning\", \"textural_element\"]",
          "compliant": true,
          "score": 0.9091,
          "detail": {
            "predicted": [
              "fat_component",
              "main_protein",
              "sauce",
              "seasoning",
              "starch",
              "textural_element"
            ],
            "expected": [
              "fat_component",
              "main_protein",
              "sauce",
              "seasoning",
              "starch"
            ],
            "precision": 0.8333,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 552,
            "output_tokens": 30,
            "latency_ms": 1327.1,
            "usd": 0.002106,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"starch\", \"main_protein\", \"sauce\", \"fat_component\", \"seasoning\", \"textural_element\"]",
          "compliant": true,
          "score": 0.9091,
          "detail": {
            "predicted": [
              "fat_component",
              "main_protein",
              "sauce",
              "seasoning",
              "starch",
              "textural_element"
            ],
            "expected": [
              "fat_component",
              "main_protein",
              "sauce",
              "seasoning",
              "starch"
            ],
            "precision": 0.8333,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 552,
            "output_tokens": 30,
            "latency_ms": 1302.5,
            "usd": 0.002106,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.9091,
      "score_summary": {
        "mean": 0.9091,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.9091,
        "ci95_hi": 0.9091,
        "n": 3
      },
      "score_if_valid": 0.9091,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 552.0,
        "output_tokens_mean": 30.0,
        "latency_ms_mean": 2424.4,
        "usd_sum": 0.006318
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-007",
      "category": "composition",
      "capability": "service_temperature",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.service_temperatures"
      ],
      "prompt_template_hash": "59ae41132f38bbfd",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Service temperatures\n\nfrozen, cold, cool, room, warm, hot, very_hot\n\n---\n\nReturn the single OGS service_temperature value that matches the traditional service of the dish below. Respond with just the value.\n\nINPUT:\n{\n  \"dish\": \"classic French vichyssoise (leek and potato soup)\"\n}\n\nANSWER FORMAT: one_of:frozen,cold,cool,room,warm,hot,very_hot",
      "expected": {
        "value": "cold"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "cold",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cold",
            "expected": "cold",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 351,
            "output_tokens": 4,
            "latency_ms": 833.1,
            "usd": 0.001113,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "cold",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cold",
            "expected": "cold",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 351,
            "output_tokens": 4,
            "latency_ms": 976.6,
            "usd": 0.001113,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "cold",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cold",
            "expected": "cold",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 351,
            "output_tokens": 4,
            "latency_ms": 712.4,
            "usd": 0.001113,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 351.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 840.7,
        "usd_sum": 0.003339
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-008",
      "category": "composition",
      "capability": "maillard_index",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.maillard_index"
      ],
      "prompt_template_hash": "5121ce4bf46aa4d5",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Maillard index (0-5 integer)\n\n- 0: no browning (raw, poached, steamed)\n- 1: pale gold (soft-baked dough, lightly sauteed)\n- 2: golden brown (roasted chicken skin)\n- 3: deep mahogany (well-seared steak crust, duck breast skin)\n- 4: very dark brown with bitter edge (heavy crust, blackened fish)\n- 5: fully charred (intentionally blackened, carbonised)\n\n---\n\nUsing the OGS Maillard index (0 = no browning, 5 = fully charred), return the integer for a steamed dim sum dumpling wrapper.\n\nINPUT:\n{\n  \"description\": \"translucent wheat-starch wrapper on a har gow, bamboo-steamed\"\n}\n\nANSWER FORMAT: integer_0_5",
      "expected": {
        "value": 0
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 0.0,
            "expected": 0.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 461,
            "output_tokens": 5,
            "latency_ms": 999.4,
            "usd": 0.001458,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 0.0,
            "expected": 0.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 461,
            "output_tokens": 5,
            "latency_ms": 1037.3,
            "usd": 0.001458,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 0.0,
            "expected": 0.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 461,
            "output_tokens": 5,
            "latency_ms": 1158.3,
            "usd": 0.001458,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 461.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1065.0,
        "usd_sum": 0.004374
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-009",
      "category": "composition",
      "capability": "maillard_index",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.maillard_index"
      ],
      "prompt_template_hash": "2a8a86bb58372ed7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Maillard index (0-5 integer)\n\n- 0: no browning (raw, poached, steamed)\n- 1: pale gold (soft-baked dough, lightly sauteed)\n- 2: golden brown (roasted chicken skin)\n- 3: deep mahogany (well-seared steak crust, duck breast skin)\n- 4: very dark brown with bitter edge (heavy crust, blackened fish)\n- 5: fully charred (intentionally blackened, carbonised)\n\n---\n\nUsing the OGS Maillard index (0-5 integer), return the value for a chicken breast poached in stock at 82°C with no prior searing.\n\nINPUT:\n{\n  \"description\": \"chicken breast poached in 82°C stock, never seared\"\n}\n\nANSWER FORMAT: integer_0_5",
      "expected": {
        "value": 0,
        "tolerance": 0
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 0.0,
            "expected": 0.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 456,
            "output_tokens": 5,
            "latency_ms": 913.7,
            "usd": 0.001443,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 0.0,
            "expected": 0.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 456,
            "output_tokens": 5,
            "latency_ms": 1890.7,
            "usd": 0.001443,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "0",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 0.0,
            "expected": 0.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 456,
            "output_tokens": 5,
            "latency_ms": 828.4,
            "usd": 0.001443,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 456.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1210.933,
        "usd_sum": 0.004329
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-010",
      "category": "composition",
      "capability": "maillard_index",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.maillard_index"
      ],
      "prompt_template_hash": "d420afcaea603a0e",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Maillard index (0-5 integer)\n\n- 0: no browning (raw, poached, steamed)\n- 1: pale gold (soft-baked dough, lightly sauteed)\n- 2: golden brown (roasted chicken skin)\n- 3: deep mahogany (well-seared steak crust, duck breast skin)\n- 4: very dark brown with bitter edge (heavy crust, blackened fish)\n- 5: fully charred (intentionally blackened, carbonised)\n\n---\n\nUsing the OGS Maillard index, return the integer for a scallop seared in hot butter until a deep golden crust forms on one side.\n\nINPUT:\n{\n  \"description\": \"diver scallop seared in foaming butter, one side deep golden, interior translucent\"\n}\n\nANSWER FORMAT: integer_0_5",
      "expected": {
        "value": 3,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "3",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 3.0,
            "expected": 3.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 459,
            "output_tokens": 5,
            "latency_ms": 746.1,
            "usd": 0.001452,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "2",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 2.0,
            "expected": 3.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 459,
            "output_tokens": 5,
            "latency_ms": 707.9,
            "usd": 0.001452,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "3",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 3.0,
            "expected": 3.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 459,
            "output_tokens": 5,
            "latency_ms": 1838.7,
            "usd": 0.001452,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.5774,
        "sem": 0.3333,
        "ci95_lo": 0.0133,
        "ci95_hi": 1.32,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 459.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1097.567,
        "usd_sum": 0.004356
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-011",
      "category": "composition",
      "capability": "reduction_level",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.reduction_level"
      ],
      "prompt_template_hash": "1d18b06eac9f3ddc",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Reduction level (0-5 integer)\n\n- 0: no reduction (raw stock, unreduced liquid)\n- 1: light simmer reduction (~10%)\n- 2: modest reduction (~30%, nappe forming)\n- 3: substantial reduction (~50%, demi-glace territory)\n- 4: heavy reduction (~75%, glace-like, syrupy)\n- 5: near-complete reduction (glace de viande, paste)\n\n---\n\nUsing the OGS reduction level (0-5 integer), return the value for a pan sauce reduced until it coats the back of a spoon (nappe).\n\nINPUT:\n{\n  \"description\": \"red wine pan sauce reduced until it coats the back of a spoon without running\"\n}\n\nANSWER FORMAT: integer_0_5",
      "expected": {
        "value": 3,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "2",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 2.0,
            "expected": 3.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 438,
            "output_tokens": 5,
            "latency_ms": 1001.2,
            "usd": 0.001389,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "2",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 2.0,
            "expected": 3.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 438,
            "output_tokens": 5,
            "latency_ms": 911.8,
            "usd": 0.001389,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "2",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 2.0,
            "expected": 3.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 438,
            "output_tokens": 5,
            "latency_ms": 1032.5,
            "usd": 0.001389,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 438.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 981.833,
        "usd_sum": 0.004167
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-012",
      "category": "composition",
      "capability": "reduction_level",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.reduction_level"
      ],
      "prompt_template_hash": "fa91f74e86a4cffb",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Reduction level (0-5 integer)\n\n- 0: no reduction (raw stock, unreduced liquid)\n- 1: light simmer reduction (~10%)\n- 2: modest reduction (~30%, nappe forming)\n- 3: substantial reduction (~50%, demi-glace territory)\n- 4: heavy reduction (~75%, glace-like, syrupy)\n- 5: near-complete reduction (glace de viande, paste)\n\n---\n\nUsing the OGS reduction level, return the integer for glace de viande (gelatinous concentrate that sets firmly at room temperature).\n\nINPUT:\n{\n  \"description\": \"glace de viande, reduced from 10 L brown stock to ~500 mL\"\n}\n\nANSWER FORMAT: integer_0_5",
      "expected": {
        "value": 5,
        "tolerance": 0
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "5",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 5.0,
            "expected": 5.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 435,
            "output_tokens": 5,
            "latency_ms": 749.1,
            "usd": 0.00138,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "5",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 5.0,
            "expected": 5.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 435,
            "output_tokens": 5,
            "latency_ms": 730.2,
            "usd": 0.00138,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "5",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 5.0,
            "expected": 5.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 435,
            "output_tokens": 5,
            "latency_ms": 1070.5,
            "usd": 0.00138,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 435.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 849.933,
        "usd_sum": 0.00414
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-013",
      "category": "composition",
      "capability": "cooking_method_recognition",
      "difficulty": "medium",
      "cuisine_region": "korean",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.cooking_methods"
      ],
      "prompt_template_hash": "f8257062be99b5d3",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n---\n\nReturn the single OGS cooking method that best fits the main cooking step for bossam pork belly (~90 min at low simmer in aromatic water).\n\nINPUT:\n{\n  \"description\": \"pork belly gently simmered for 90 minutes in water with ginger, doenjang, and pepper until fork-tender\"\n}\n\nANSWER FORMAT: one_of:raw,seared,grilled,roasted,braised,poached,steamed,fried,confit,smoked,fermented,cured,pickled,dehydrated,sous_vide,blanched,sauteed,baked,charred,emulsified",
      "expected": {
        "value": "poached"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "poached",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "poached",
            "expected": "poached",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 473,
            "output_tokens": 5,
            "latency_ms": 1234.2,
            "usd": 0.001494,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "poached",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "poached",
            "expected": "poached",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 473,
            "output_tokens": 5,
            "latency_ms": 1283.2,
            "usd": 0.001494,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "poached",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "poached",
            "expected": "poached",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 473,
            "output_tokens": 5,
            "latency_ms": 653.8,
            "usd": 0.001494,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 473.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1057.067,
        "usd_sum": 0.004482
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-014",
      "category": "composition",
      "capability": "component_role_assignment",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.component_roles"
      ],
      "prompt_template_hash": "114b0796c10cf141",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nReturn the single OGS component role that best fits a sprinkle of gremolata (parsley, garlic, lemon zest) finishing an osso buco at service.\n\nINPUT:\n{\n  \"description\": \"gremolata finishing an osso buco milanese at service\"\n}\n\nANSWER FORMAT: one_of:main_protein,main_vegetable,starch,sauce,fat_component,acid_component,aromatic_garnish,seasoning,textural_element",
      "expected": {
        "value": "aromatic_garnish"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "aromatic_garnish",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "aromatic_garnish",
            "expected": "aromatic_garnish",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 407,
            "output_tokens": 8,
            "latency_ms": 1786.2,
            "usd": 0.001341,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "aromatic_garnish",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "aromatic_garnish",
            "expected": "aromatic_garnish",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 407,
            "output_tokens": 8,
            "latency_ms": 1223.2,
            "usd": 0.001341,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "aromatic_garnish",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "aromatic_garnish",
            "expected": "aromatic_garnish",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 407,
            "output_tokens": 8,
            "latency_ms": 907.2,
            "usd": 0.001341,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 407.0,
        "output_tokens_mean": 8.0,
        "latency_ms_mean": 1305.533,
        "usd_sum": 0.004023
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-015",
      "category": "composition",
      "capability": "component_set_identification",
      "difficulty": "hard",
      "cuisine_region": "thai",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.component_roles"
      ],
      "prompt_template_hash": "07976f0ee97244e5",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nList the OGS component roles that appear in a classic Pad Thai (rice noodles, shrimp, tofu, egg, tamarind-palm sugar-fish sauce, bean sprouts, crushed peanuts, chives). Return a JSON array of role names.\n\nINPUT:\n{\n  \"dish\": \"classic Pad Thai\"\n}\n\nANSWER FORMAT: set_of_roles",
      "expected": {
        "values": [
          "starch",
          "main_protein",
          "acid_component",
          "seasoning",
          "aromatic_garnish",
          "textural_element",
          "fat_component"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"main_protein\",\"starch\",\"sauce\",\"acid_component\",\"textural_element\",\"aromatic_garnish\",\"seasoning\"]",
          "compliant": true,
          "score": 0.8571,
          "detail": {
            "predicted": [
              "acid_component",
              "aromatic_garnish",
              "main_protein",
              "sauce",
              "seasoning",
              "starch",
              "textural_element"
            ],
            "expected": [
              "acid_component",
              "aromatic_garnish",
              "fat_component",
              "main_protein",
              "seasoning",
              "starch",
              "textural_element"
            ],
            "precision": 0.8571,
            "recall": 0.8571
          },
          "metrics": {
            "input_tokens": 383,
            "output_tokens": 31,
            "latency_ms": 1027.2,
            "usd": 0.001614,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"main_protein\", \"starch\", \"sauce\", \"textural_element\", \"aromatic_garnish\", \"seasoning\", \"acid_component\", \"fat_component\", \"main_vegetable\"]",
          "compliant": true,
          "score": 0.875,
          "detail": {
            "predicted": [
              "acid_component",
              "aromatic_garnish",
              "fat_component",
              "main_protein",
              "main_vegetable",
              "sauce",
              "seasoning",
              "starch",
              "textural_element"
            ],
            "expected": [
              "acid_component",
              "aromatic_garnish",
              "fat_component",
              "main_protein",
              "seasoning",
              "starch",
              "textural_element"
            ],
            "precision": 0.7778,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 383,
            "output_tokens": 48,
            "latency_ms": 1310.5,
            "usd": 0.001869,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"main_protein\",\"starch\",\"sauce\",\"acid_component\",\"textural_element\",\"aromatic_garnish\",\"seasoning\"]",
          "compliant": true,
          "score": 0.8571,
          "detail": {
            "predicted": [
              "acid_component",
              "aromatic_garnish",
              "main_protein",
              "sauce",
              "seasoning",
              "starch",
              "textural_element"
            ],
            "expected": [
              "acid_component",
              "aromatic_garnish",
              "fat_component",
              "main_protein",
              "seasoning",
              "starch",
              "textural_element"
            ],
            "precision": 0.8571,
            "recall": 0.8571
          },
          "metrics": {
            "input_tokens": 383,
            "output_tokens": 31,
            "latency_ms": 1178.1,
            "usd": 0.001614,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.8631,
      "score_summary": {
        "mean": 0.8631,
        "stdev": 0.0103,
        "sem": 0.006,
        "ci95_lo": 0.8514,
        "ci95_hi": 0.8748,
        "n": 3
      },
      "score_if_valid": 0.8631,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 383.0,
        "output_tokens_mean": 36.667,
        "latency_ms_mean": 1171.933,
        "usd_sum": 0.005097
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-016",
      "category": "composition",
      "capability": "service_temperature",
      "difficulty": "easy",
      "cuisine_region": "spanish",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.service_temperatures"
      ],
      "prompt_template_hash": "609a0643399a8df0",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Service temperatures\n\nfrozen, cold, cool, room, warm, hot, very_hot\n\n---\n\nReturn the single OGS service_temperature value for a Spanish ajo blanco (chilled almond-garlic soup).\n\nINPUT:\n{\n  \"dish\": \"ajo blanco, chilled Andalusian almond and garlic soup\"\n}\n\nANSWER FORMAT: one_of:frozen,cold,cool,room,warm,hot,very_hot",
      "expected": {
        "value": "cold"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "cold",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cold",
            "expected": "cold",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 356,
            "output_tokens": 4,
            "latency_ms": 795.2,
            "usd": 0.001128,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "cold",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cold",
            "expected": "cold",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 356,
            "output_tokens": 4,
            "latency_ms": 2859.2,
            "usd": 0.001128,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "cold",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cold",
            "expected": "cold",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 356,
            "output_tokens": 4,
            "latency_ms": 846.9,
            "usd": 0.001128,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 356.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1500.433,
        "usd_sum": 0.003384
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-017",
      "category": "composition",
      "capability": "cooking_method_recognition",
      "difficulty": "medium",
      "cuisine_region": "chinese",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.cooking_methods"
      ],
      "prompt_template_hash": "09a6e47eda264ded",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n---\n\nReturn the single OGS cooking method that best describes the signature crisp skin of Peking duck (air-dried then hung-roasted in a wood-fired oven).\n\nINPUT:\n{\n  \"description\": \"Peking duck hung-roasted in a jujube-wood oven at ~230°C until the skin is lacquered and crisp\"\n}\n\nANSWER FORMAT: one_of:raw,seared,grilled,roasted,braised,poached,steamed,fried,confit,smoked,fermented,cured,pickled,dehydrated,sous_vide,blanched,sauteed,baked,charred,emulsified",
      "expected": {
        "value": "roasted"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "roasted",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "roasted",
            "expected": "roasted",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 479,
            "output_tokens": 5,
            "latency_ms": 1060.7,
            "usd": 0.001512,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "roasted",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "roasted",
            "expected": "roasted",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 479,
            "output_tokens": 5,
            "latency_ms": 927.5,
            "usd": 0.001512,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "roasted",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "roasted",
            "expected": "roasted",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 479,
            "output_tokens": 5,
            "latency_ms": 941.2,
            "usd": 0.001512,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 479.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 976.467,
        "usd_sum": 0.004536
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-018",
      "category": "composition",
      "capability": "component_role_assignment",
      "difficulty": "medium",
      "cuisine_region": "levantine",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.component_roles"
      ],
      "prompt_template_hash": "32d4a921f4fbdd98",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nReturn the OGS component role that best fits tahini as an ingredient in a classic hummus recipe.\n\nINPUT:\n{\n  \"description\": \"25% tahini by weight blended into chickpea puree to make hummus\"\n}\n\nANSWER FORMAT: one_of:main_protein,main_vegetable,starch,sauce,fat_component,acid_component,aromatic_garnish,seasoning,textural_element",
      "expected": {
        "value": "fat_component"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "fat_component",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fat_component",
            "expected": "fat_component",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 392,
            "output_tokens": 6,
            "latency_ms": 1447.6,
            "usd": 0.001266,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "fat_component",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fat_component",
            "expected": "fat_component",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 392,
            "output_tokens": 6,
            "latency_ms": 1281.4,
            "usd": 0.001266,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "fat_component",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fat_component",
            "expected": "fat_component",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 392,
            "output_tokens": 6,
            "latency_ms": 1277.1,
            "usd": 0.001266,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 392.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 1335.367,
        "usd_sum": 0.003798
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-019",
      "category": "composition",
      "capability": "service_temperature",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.service_temperatures"
      ],
      "prompt_template_hash": "eaa27377a264e2e2",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Service temperatures\n\nfrozen, cold, cool, room, warm, hot, very_hot\n\n---\n\nReturn the OGS service_temperature for a stirred-over-ice Negroni served in a rocks glass with a large cube.\n\nINPUT:\n{\n  \"dish\": \"stirred Negroni served over a single large ice cube\"\n}\n\nANSWER FORMAT: one_of:frozen,cold,cool,room,warm,hot,very_hot",
      "expected": {
        "value": "cold"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "cold",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cold",
            "expected": "cold",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 355,
            "output_tokens": 4,
            "latency_ms": 901.3,
            "usd": 0.001125,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "cold",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cold",
            "expected": "cold",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 355,
            "output_tokens": 4,
            "latency_ms": 1345.9,
            "usd": 0.001125,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "cold",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cold",
            "expected": "cold",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 355,
            "output_tokens": 4,
            "latency_ms": 751.0,
            "usd": 0.001125,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 355.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 999.4,
        "usd_sum": 0.003375
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:comp-020",
      "category": "composition",
      "capability": "maillard_index",
      "difficulty": "hard",
      "cuisine_region": "american",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.maillard_index"
      ],
      "prompt_template_hash": "535bf59391539a15",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Maillard index (0-5 integer)\n\n- 0: no browning (raw, poached, steamed)\n- 1: pale gold (soft-baked dough, lightly sauteed)\n- 2: golden brown (roasted chicken skin)\n- 3: deep mahogany (well-seared steak crust, duck breast skin)\n- 4: very dark brown with bitter edge (heavy crust, blackened fish)\n- 5: fully charred (intentionally blackened, carbonised)\n\n---\n\nUsing the OGS Maillard index, return the integer for blackened redfish cooked in a smoking-hot cast iron skillet with Cajun spice rub.\n\nINPUT:\n{\n  \"description\": \"redfish fillet crusted with spice and cooked in a 300°C dry cast-iron pan until the surface is blackened and slightly bitter\"\n}\n\nANSWER FORMAT: integer_0_5",
      "expected": {
        "value": 4,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "4",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 4.0,
            "expected": 4.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 474,
            "output_tokens": 5,
            "latency_ms": 1600.0,
            "usd": 0.001497,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "4",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 4.0,
            "expected": 4.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 474,
            "output_tokens": 5,
            "latency_ms": 1144.9,
            "usd": 0.001497,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "4",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 4.0,
            "expected": 4.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 474,
            "output_tokens": 5,
            "latency_ms": 1381.5,
            "usd": 0.001497,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 474.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1375.467,
        "usd_sum": 0.004491
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-001",
      "category": "dietary",
      "capability": "vegan_compatibility",
      "difficulty": "easy",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "18b5c9c7c958a521",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. 
Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nIs a traditional Bolognese tagliatelle al ragù (tagliatelle with ragù made from beef, pancetta, soffritto, tomato, red wine, whole milk, and butter) compatible with a strict vegan diet? Answer with a single word: yes or no.\n\nINPUT:\n{\n  \"dish\": \"tagliatelle al ragù\",\n  \"diet\": \"vegan\"\n}\n\nANSWER FORMAT: exact:yes|no",
      "expected": {
        "value": "no"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "no",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "no",
            "expected": "no",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 1056,
            "output_tokens": 4,
            "latency_ms": 885.6,
            "usd": 0.003228,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "no",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "no",
            "expected": "no",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 1056,
            "output_tokens": 4,
            "latency_ms": 994.2,
            "usd": 0.003228,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "no",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "no",
            "expected": "no",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 1056,
            "output_tokens": 4,
            "latency_ms": 1886.5,
            "usd": 0.003228,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1056.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1255.433,
        "usd_sum": 0.009684
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-002",
      "category": "dietary",
      "capability": "gluten_free_violation",
      "difficulty": "easy",
      "cuisine_region": "japanese",
      "scorer": "exact_match",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "04840e46f80ccb5b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. 
Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nA kitchen claims this stir-fry is gluten-free: rice noodles, tofu, shiitake, broccoli, garlic, ginger, regular Japanese soy sauce, sesame oil, and scallions. Return the single ingredient that breaks the claim.\n\nINPUT:\n{\n  \"dish\": \"stir fry\",\n  \"ingredients\": [\n    \"rice noodles\",\n    \"tofu\",\n    \"shiitake\",\n    \"broccoli\",\n    \"garlic\",\n    \"ginger\",\n    \"regular Japanese soy sauce\",\n    \"sesame oil\",\n    \"scallions\"\n  ]\n}\n\nANSWER FORMAT: exact:string",
      "expected": {
        "value": "regular Japanese soy sauce",
        "aliases": [
          "soy sauce",
          "regular soy sauce"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "regular Japanese soy sauce",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "regular japanese soy sauce",
            "expected": "regular japanese soy sauce",
            "aliases": [
              "soy sauce",
              "regular soy sauce"
            ]
          },
          "metrics": {
            "input_tokens": 1112,
            "output_tokens": 8,
            "latency_ms": 809.0,
            "usd": 0.003456,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "regular Japanese soy sauce",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "regular japanese soy sauce",
            "expected": "regular japanese soy sauce",
            "aliases": [
              "soy sauce",
              "regular soy sauce"
            ]
          },
          "metrics": {
            "input_tokens": 1112,
            "output_tokens": 8,
            "latency_ms": 675.9,
            "usd": 0.003456,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "regular Japanese soy sauce",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "regular japanese soy sauce",
            "expected": "regular japanese soy sauce",
            "aliases": [
              "soy sauce",
              "regular soy sauce"
            ]
          },
          "metrics": {
            "input_tokens": 1112,
            "output_tokens": 8,
            "latency_ms": 718.6,
            "usd": 0.003456,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1112.0,
        "output_tokens_mean": 8.0,
        "latency_ms_mean": 734.5,
        "usd_sum": 0.010368
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-003",
      "category": "dietary",
      "capability": "halal_compliance",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "013d6295fee472b5",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. 
Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nIs a classic panna cotta (cream, sugar, vanilla, leaf gelatine) halal? Answer with yes or no.\n\nINPUT:\n{\n  \"dish\": \"panna cotta\",\n  \"ingredients\": [\n    \"cream\",\n    \"sugar\",\n    \"vanilla\",\n    \"leaf gelatine\"\n  ]\n}\n\nANSWER FORMAT: exact:yes|no",
      "expected": {
        "value": "no"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "no",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "no",
            "expected": "no",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 1041,
            "output_tokens": 4,
            "latency_ms": 722.4,
            "usd": 0.003183,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "no",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "no",
            "expected": "no",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 1041,
            "output_tokens": 4,
            "latency_ms": 813.5,
            "usd": 0.003183,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "no",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "no",
            "expected": "no",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 1041,
            "output_tokens": 4,
            "latency_ms": 813.9,
            "usd": 0.003183,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1041.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 783.267,
        "usd_sum": 0.009549
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-004",
      "category": "dietary",
      "capability": "kosher_meat_dairy",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "85fc94958c4357ef",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. 
Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nChicken parmigiana is breaded chicken breast topped with tomato sauce and melted mozzarella, served with a side of cream-rich garlic mashed potatoes. From a kosher perspective, what is the core violation? Respond with one short phrase.\n\nINPUT:\n{\n  \"dish\": \"chicken parmigiana with creamy mash\"\n}\n\nANSWER FORMAT: exact:string",
      "expected": {
        "value": "mixing meat and dairy",
        "aliases": [
          "meat and dairy mixed",
          "basar b'chalav",
          "poultry with dairy",
          "meat with dairy"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "mixing meat and dairy",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "mixing meat and dairy",
            "expected": "mixing meat and dairy",
            "aliases": [
              "meat and dairy mixed",
              "basar b'chalav",
              "poultry with dairy",
              "meat with dairy"
            ]
          },
          "metrics": {
            "input_tokens": 1043,
            "output_tokens": 7,
            "latency_ms": 1030.9,
            "usd": 0.003234,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "mixing meat and dairy",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "mixing meat and dairy",
            "expected": "mixing meat and dairy",
            "aliases": [
              "meat and dairy mixed",
              "basar b'chalav",
              "poultry with dairy",
              "meat with dairy"
            ]
          },
          "metrics": {
            "input_tokens": 1043,
            "output_tokens": 7,
            "latency_ms": 967.2,
            "usd": 0.003234,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "mixing meat and dairy",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "mixing meat and dairy",
            "expected": "mixing meat and dairy",
            "aliases": [
              "meat and dairy mixed",
              "basar b'chalav",
              "poultry with dairy",
              "meat with dairy"
            ]
          },
          "metrics": {
            "input_tokens": 1043,
            "output_tokens": 7,
            "latency_ms": 971.2,
            "usd": 0.003234,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1043.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 989.767,
        "usd_sum": 0.009702
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-005",
      "category": "dietary",
      "capability": "allergen_detection",
      "difficulty": "easy",
      "cuisine_region": "italian",
      "scorer": "set_f1",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "f40f4c25a2ee4b71",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. 
Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nFor a dish of pesto genovese made from basil, garlic, pine nuts, parmesan, pecorino, and olive oil, list the EU Annex II allergens that must be declared. Return a comma-separated list using the OGS allergen IDs.\n\nINPUT:\n{\n  \"dish\": \"pesto genovese\",\n  \"ingredients\": [\n    \"basil\",\n    \"garlic\",\n    \"pine nuts\",\n    \"parmesan\",\n    \"pecorino\",\n    \"olive oil\"\n  ]\n}\n\nANSWER FORMAT: set:comma-separated OGS allergen IDs",
      "expected": {
        "values": [
          "tree_nuts",
          "milk"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "set:tree_nuts,milk",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": [
              "milk",
              "set:tree_nuts"
            ],
            "expected": [
              "milk",
              "tree_nuts"
            ],
            "precision": 0.5,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1098,
            "output_tokens": 10,
            "latency_ms": 6075.1,
            "usd": 0.003444,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "tree_nuts,milk",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": [
              "milk",
              "tree_nuts"
            ],
            "expected": [
              "milk",
              "tree_nuts"
            ],
            "precision": 1.0,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 1098,
            "output_tokens": 8,
            "latency_ms": 895.6,
            "usd": 0.003414,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "set:tree_nuts,milk",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": [
              "milk",
              "set:tree_nuts"
            ],
            "expected": [
              "milk",
              "tree_nuts"
            ],
            "precision": 0.5,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1098,
            "output_tokens": 10,
            "latency_ms": 1045.4,
            "usd": 0.003444,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.2887,
        "sem": 0.1667,
        "ci95_lo": 0.34,
        "ci95_hi": 0.9933,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1098.0,
        "output_tokens_mean": 9.333,
        "latency_ms_mean": 2672.033,
        "usd_sum": 0.010302
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-006",
      "category": "dietary",
      "capability": "allergen_detection",
      "difficulty": "medium",
      "cuisine_region": "thai",
      "scorer": "set_f1",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "7f45e388301a68a5",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. 
Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nReturn the EU Annex II allergens that must be declared for a standard pad thai (rice noodles, shrimp, egg, tamarind, fish sauce, palm sugar, roasted peanuts, bean sprouts, garlic chives, lime, chili).\n\nINPUT:\n{\n  \"dish\": \"pad thai\"\n}\n\nANSWER FORMAT: set:comma-separated OGS allergen IDs",
      "expected": {
        "values": [
          "crustaceans",
          "eggs",
          "fish",
          "peanuts"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "eggs,fish,peanuts,crustaceans,soybeans\n\nWait, let me reconsider. Fish sauce contains fish. Shrimp are crustaceans. Eggs are present. Peanuts are present. \n\nDoes fish sauce contain soybeans? Standard fish sauce is just fish and salt - no soybeans. Let me reconsider soy.\n\nThe ingredients listed are: rice noodles, shrimp, egg, tamarind, fish sauce, palm sugar, roasted peanuts, bean sprouts, garlic chives, lime, chili.\n\n- shrimp → `crustaceans`\n- egg → `eggs`\n- fish sauce → `fish`\n- roasted peanuts → `peanuts`\n\nNo soybeans, no gluten, no tree nuts, no milk, no celery, no mustard, no sesame, no sulphur dioxide, no lupin, no molluscs.\n\ncrustaceans,eggs,fish,peanuts",
          "compliant": true,
          "score": 0.2162,
          "detail": {
            "predicted": [
              "- egg → `eggs",
              "- fish sauce → `fish",
              "- roasted peanuts → `peanuts",
              "- shrimp → `crustaceans",
              "Does fish sauce contain soybeans? Standard fish sauce is just fish and salt - no soybeans. Let me reconsider soy.",
              "No soybeans",
              "The ingredients listed are: rice noodles",
              "Wait",
              "bean sprouts",
              "chili.",
              "crustaceans",
              "egg",
              "eggs",
              "fish",
              "fish sauce",
              "garlic chives",
              "let me reconsider. Fish sauce contains fish. Shrimp are crustaceans. Eggs are present. Peanuts are present.",
              "lime",
              "no celery",
              "no gluten",
              "no lupin",
              "no milk",
              "no molluscs.",
              "no mustard",
              "no sesame",
              "no sulphur dioxide",
              "no tree nuts",
              "palm sugar",
              "peanuts",
              "roasted peanuts",
              "shrimp",
              "soybeans",
              "tamarind"
            ],
            "expected": [
              "crustaceans",
              "eggs",
              "fish",
              "peanuts"
            ],
            "precision": 0.1212,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 1046,
            "output_tokens": 231,
            "latency_ms": 4229.2,
            "usd": 0.006603,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "eggs,fish,peanuts,crustaceans,soybeans\n\nWait, let me reconsider. Fish sauce contains fish. Shrimp are crustaceans. Eggs are present. Peanuts are present. \n\nDoes fish sauce contain soybeans? Standard fish sauce is just fish and salt - no soybeans. Bean sprouts are typically mung bean sprouts - not soybeans specifically. So soybeans should not be declared.\n\neggs,fish,peanuts,crustaceans",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "Does fish sauce contain soybeans? Standard fish sauce is just fish and salt - no soybeans. Bean sprouts are typically mung bean sprouts - not soybeans specifically. So soybeans should not be declared.",
              "Wait",
              "crustaceans",
              "eggs",
              "fish",
              "let me reconsider. Fish sauce contains fish. Shrimp are crustaceans. Eggs are present. Peanuts are present.",
              "peanuts",
              "soybeans"
            ],
            "expected": [
              "crustaceans",
              "eggs",
              "fish",
              "peanuts"
            ],
            "precision": 0.5,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 1046,
            "output_tokens": 118,
            "latency_ms": 2919.1,
            "usd": 0.004908,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Looking at each ingredient:\n\n- Rice noodles: no allergens\n- Shrimp: `crustaceans`\n- Egg: `eggs`\n- Tamarind: no allergens\n- Fish sauce: `fish`\n- Palm sugar: no allergens\n- Roasted peanuts: `peanuts`\n- Bean sprouts: no allergens\n- Garlic chives: no allergens\n- Lime: no allergens\n- Chili: no allergens\n\nset:crustaceans,eggs,fish,peanuts",
          "compliant": true,
          "score": 0.3,
          "detail": {
            "predicted": [
              "- Bean sprouts: no allergens",
              "- Chili: no allergens",
              "- Egg: `eggs",
              "- Fish sauce: `fish",
              "- Garlic chives: no allergens",
              "- Lime: no allergens",
              "- Palm sugar: no allergens",
              "- Rice noodles: no allergens",
              "- Roasted peanuts: `peanuts",
              "- Shrimp: `crustaceans",
              "- Tamarind: no allergens",
              "Looking at each ingredient:",
              "eggs",
              "fish",
              "peanuts",
              "set:crustaceans"
            ],
            "expected": [
              "crustaceans",
              "eggs",
              "fish",
              "peanuts"
            ],
            "precision": 0.1875,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1046,
            "output_tokens": 131,
            "latency_ms": 6019.7,
            "usd": 0.005103,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.3943,
      "score_summary": {
        "mean": 0.3943,
        "stdev": 0.2396,
        "sem": 0.1383,
        "ci95_lo": 0.1232,
        "ci95_hi": 0.6654,
        "n": 3
      },
      "score_if_valid": 0.3943,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1046.0,
        "output_tokens_mean": 160.0,
        "latency_ms_mean": 4389.333,
        "usd_sum": 0.016614
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-007",
      "category": "dietary",
      "capability": "vegan_substitution",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "eb092a2c162960f6",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. 
Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nName a single plant-based ingredient commonly used to emulsify and thicken a vegan carbonara sauce in place of eggs, while also delivering umami. Respond with one short ingredient name.\n\nINPUT:\n{\n  \"target_role\": \"egg emulsifier and umami in vegan carbonara\"\n}\n\nANSWER FORMAT: exact:string",
      "expected": {
        "value": "silken tofu",
        "aliases": [
          "silken tofu blended with nutritional yeast",
          "soft tofu"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "cashew cream doesn't deliver umami well. The best answer here is:\n\nsilken tofu\n\nWait, but silken tofu doesn't have strong umami either. Let me think about what delivers both emulsification/thickening AND umami in vegan carbonara.\n\nThe ingredient that is commonly used as an egg substitute for emulsification while also delivering umami is:\n\nnutritional yeast\n\nBut nutritional yeast doesn't emulsify on its own...\n\nThe most common ingredient used specifically for this dual purpose in vegan carbonara recipes is **silken tofu** (emulsifies/thickens) combined with nutritional yeast (umami), but if a single ingredient is needed:\n\nThe answer is **white miso** — it emulsifies and delivers umami. But it's not typically the primary thickener.\n\nActually, the most commonly cited single ingredient for vegan carbonara that both emulsifies and provides umami is:\n\ncashew cream with nutritional yeast... but that's two.\n\nThe single ingredient answer most commonly given is:\n\nsilken tofu\n\nsilken tofu",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "cashew cream doesn't deliver umami well. the best answer here is: silken tofu wait, but silken tofu doesn't have strong umami either. let me think about what delivers both emulsification/thickening and umami in vegan carbonara. the ingredient that is commonly used as an egg substitute for emulsification while also delivering umami is: nutritional yeast but nutritional yeast doesn't emulsify on its own... the most common ingredient used specifically for this dual purpose in vegan carbonara recipes is **silken tofu** (emulsifies/thickens) combined with nutritional yeast (umami), but if a single ingredient is needed: the answer is **white miso** — it emulsifies and delivers umami. but it's not typically the primary thickener. actually, the most commonly cited single ingredient for vegan carbonara that both emulsifies and provides umami is: cashew cream with nutritional yeast... but that's two. the single ingredient answer most commonly given is: silken tofu silken tofu",
            "expected": "silken tofu",
            "aliases": [
              "silken tofu blended with nutritional yeast",
              "soft tofu"
            ]
          },
          "metrics": {
            "input_tokens": 1031,
            "output_tokens": 246,
            "latency_ms": 5990.2,
            "usd": 0.006783,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "cashew cream doesn't deliver umami... Let me think about this carefully.\n\nThe ingredient needs to:\n1. Emulsify and thicken (like eggs do in carbonara)\n2. Deliver umami\n3. Be plant-based/vegan\n\nSilken tofu can emulsify and has some umami. Nutritional yeast delivers umami but doesn't emulsify well alone. \n\nThe most common answer for vegan carbonara that combines both functions is **silken tofu** — it emulsifies/thickens the sauce and provides some umami depth.\n\nsilken tofu",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "cashew cream doesn't deliver umami... let me think about this carefully. the ingredient needs to: 1. emulsify and thicken (like eggs do in carbonara) 2. deliver umami 3. be plant-based/vegan silken tofu can emulsify and has some umami. nutritional yeast delivers umami but doesn't emulsify well alone. the most common answer for vegan carbonara that combines both functions is **silken tofu** — it emulsifies/thickens the sauce and provides some umami depth. silken tofu",
            "expected": "silken tofu",
            "aliases": [
              "silken tofu blended with nutritional yeast",
              "soft tofu"
            ]
          },
          "metrics": {
            "input_tokens": 1031,
            "output_tokens": 139,
            "latency_ms": 3592.5,
            "usd": 0.005178,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "cashew cream doesn't deliver umami well. The ingredient that both emulsifies/thickens AND delivers umami in vegan carbonara is:\n\nnutritional yeast\n\nWait — nutritional yeast delivers umami and cheesy flavor but isn't primarily an emulsifier/thickener. The classic vegan carbonara uses **cashew cream** for emulsification and thickness, but for umami that's typically nutritional yeast or miso.\n\nThe single ingredient that can serve all three roles (emulsify, thicken, and deliver umami) is:\n\nwhite miso\n\nActually, the most commonly cited single ingredient in vegan carbonara recipes that emulsifies, thickens, AND delivers umami is:\n\n**cashew cream** (with nutritional yeast added separately)\n\nBut if forced to one ingredient delivering all three: **silken tofu** blended with nutritional yeast... \n\nThe best single answer fitting all criteria is:\n\nnutritional yeast\n\nNo — let me settle this: the ingredient most commonly used as the egg replacer (emulsifier + thickener) **and** umami source combined in vegan carbonara is:\n\n**silken tofu**\n\n(blended, it emulsifies and thickens; fermented/seasoned versions add umami)\n\nsilken tofu",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "cashew cream doesn't deliver umami well. the ingredient that both emulsifies/thickens and delivers umami in vegan carbonara is: nutritional yeast wait — nutritional yeast delivers umami and cheesy flavor but isn't primarily an emulsifier/thickener. the classic vegan carbonara uses **cashew cream** for emulsification and thickness, but for umami that's typically nutritional yeast or miso. the single ingredient that can serve all three roles (emulsify, thicken, and deliver umami) is: white miso actually, the most commonly cited single ingredient in vegan carbonara recipes that emulsifies, thickens, and delivers umami is: **cashew cream** (with nutritional yeast added separately) but if forced to one ingredient delivering all three: **silken tofu** blended with nutritional yeast... the best single answer fitting all criteria is: nutritional yeast no — let me settle this: the ingredient most commonly used as the egg replacer (emulsifier + thickener) **and** umami source combined in vegan carbonara is: **silken tofu** (blended, it emulsifies and thickens; fermented/seasoned versions add umami) silken tofu",
            "expected": "silken tofu",
            "aliases": [
              "silken tofu blended with nutritional yeast",
              "soft tofu"
            ]
          },
          "metrics": {
            "input_tokens": 1031,
            "output_tokens": 294,
            "latency_ms": 7101.7,
            "usd": 0.007503,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1031.0,
        "output_tokens_mean": 226.333,
        "latency_ms_mean": 5561.467,
        "usd_sum": 0.019464
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-008",
      "category": "dietary",
      "capability": "gluten_free_substitution",
      "difficulty": "easy",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "480be99b3f35a51d",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. 
Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nSuggest a single widely-used gluten-free flour to make a béchamel roux that maintains classic texture. Respond with one word.\n\nINPUT:\n{\n  \"target_role\": \"béchamel roux, gluten-free\"\n}\n\nANSWER FORMAT: exact:string",
      "expected": {
        "value": "rice flour",
        "aliases": [
          "cornstarch",
          "corn starch",
          "cornflour",
          "tapioca starch",
          "tapioca flour"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "rice",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "rice",
            "expected": "rice flour",
            "aliases": [
              "cornstarch",
              "corn starch",
              "cornflour",
              "tapioca starch",
              "tapioca flour"
            ]
          },
          "metrics": {
            "input_tokens": 1020,
            "output_tokens": 4,
            "latency_ms": 1328.0,
            "usd": 0.00312,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "rice",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "rice",
            "expected": "rice flour",
            "aliases": [
              "cornstarch",
              "corn starch",
              "cornflour",
              "tapioca starch",
              "tapioca flour"
            ]
          },
          "metrics": {
            "input_tokens": 1020,
            "output_tokens": 4,
            "latency_ms": 990.7,
            "usd": 0.00312,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "rice",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "rice",
            "expected": "rice flour",
            "aliases": [
              "cornstarch",
              "corn starch",
              "cornflour",
              "tapioca starch",
              "tapioca flour"
            ]
          },
          "metrics": {
            "input_tokens": 1020,
            "output_tokens": 4,
            "latency_ms": 715.5,
            "usd": 0.00312,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1020.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1011.4,
        "usd_sum": 0.00936
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-009",
      "category": "dietary",
      "capability": "vegan_compatibility",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "set_f1",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "936752891554dba3",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. 
Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nFrom this list, return every item that is NOT vegan as a comma-separated list: honey, agave nectar, wine clarified with isinglass, wine clarified with bentonite, carmine, nutritional yeast, shellac, maple syrup, worcestershire sauce (traditional), vegetable glycerin.\n\nINPUT:\n{\n  \"items\": [\n    \"honey\",\n    \"agave nectar\",\n    \"wine clarified with isinglass\",\n    \"wine clarified with bentonite\",\n    \"carmine\",\n    \"nutritional yeast\",\n    \"shellac\",\n    \"maple syrup\",\n    \"worcestershire sauce (traditional)\",\n    \"vegetable glycerin\"\n  ]\n}\n\nANSWER FORMAT: set:comma-separated item names",
      "expected": {
        "values": [
          "honey",
          "wine clarified with isinglass",
          "carmine",
          "shellac",
          "worcestershire sauce (traditional)"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "I'll check each item against the vegan rules (no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce):\n\n- **honey** – from bees → NOT vegan\n- **agave nectar** – plant-derived → vegan\n- **wine clarified with isinglass** – isinglass is fish-derived → NOT vegan\n- **wine clarified with bentonite** – bentonite is a mineral clay → vegan\n- **carmine** – from cochineal insects → NOT vegan\n- **nutritional yeast** – fungal/yeast, no animal products → vegan\n- **shellac** – secreted by lac insects → NOT vegan\n- **maple syrup** – plant-derived → vegan\n- **worcestershire sauce (traditional)** – contains anchovies (fish sauce) → NOT vegan\n- **vegetable glycerin** – plant-derived → vegan\n\nset:honey, wine clarified with isinglass, carmine, shellac, worcestershire sauce (traditional)",
          "compliant": true,
          "score": 0.2759,
          "detail": {
            "predicted": [
              "- **agave nectar** – plant-derived → vegan",
              "- **carmine** – from cochineal insects → NOT vegan",
              "- **honey** – from bees → NOT vegan",
              "- **maple syrup** – plant-derived → vegan",
              "- **nutritional yeast** – fungal/yeast",
              "- **shellac** – secreted by lac insects → NOT vegan",
              "- **vegetable glycerin** – plant-derived → vegan",
              "- **wine clarified with bentonite** – bentonite is a mineral clay → vegan",
              "- **wine clarified with isinglass** – isinglass is fish-derived → NOT vegan",
              "- **worcestershire sauce (traditional)** – contains anchovies (fish sauce) → NOT vegan",
              "I'll check each item against the vegan rules (no animal-derived ingredients",
              "animal rennet",
              "carmine",
              "dairy",
              "eggs",
              "gelatine",
              "including honey",
              "isinglass",
              "no animal products → vegan",
              "or fish sauce):",
              "set:honey",
              "shellac",
              "wine clarified with isinglass",
              "worcestershire sauce (traditional)"
            ],
            "expected": [
              "carmine",
              "honey",
              "shellac",
              "wine clarified with isinglass",
              "worcestershire sauce (traditional)"
            ],
            "precision": 0.1667,
            "recall": 0.8
          },
          "metrics": {
            "input_tokens": 1128,
            "output_tokens": 252,
            "latency_ms": 4043.6,
            "usd": 0.007164,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "I'll evaluate each item against vegan rules (no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce):\n\n- **honey** – from bees → NOT vegan\n- **agave nectar** – plant-derived → vegan\n- **wine clarified with isinglass** – isinglass is fish-derived → NOT vegan\n- **wine clarified with bentonite** – bentonite is a clay (mineral) → vegan\n- **carmine** – from cochineal insects → NOT vegan\n- **nutritional yeast** – fungus, plant-based → vegan\n- **shellac** – from lac insects → NOT vegan\n- **maple syrup** – plant-derived → vegan\n- **worcestershire sauce (traditional)** – contains anchovies (fish sauce) → NOT vegan\n- **vegetable glycerin** – plant-derived → vegan\n\nset:honey, wine clarified with isinglass, carmine, shellac, worcestershire sauce (traditional)",
          "compliant": true,
          "score": 0.2759,
          "detail": {
            "predicted": [
              "- **agave nectar** – plant-derived → vegan",
              "- **carmine** – from cochineal insects → NOT vegan",
              "- **honey** – from bees → NOT vegan",
              "- **maple syrup** – plant-derived → vegan",
              "- **nutritional yeast** – fungus",
              "- **shellac** – from lac insects → NOT vegan",
              "- **vegetable glycerin** – plant-derived → vegan",
              "- **wine clarified with bentonite** – bentonite is a clay (mineral) → vegan",
              "- **wine clarified with isinglass** – isinglass is fish-derived → NOT vegan",
              "- **worcestershire sauce (traditional)** – contains anchovies (fish sauce) → NOT vegan",
              "I'll evaluate each item against vegan rules (no animal-derived ingredients",
              "animal rennet",
              "carmine",
              "dairy",
              "eggs",
              "gelatine",
              "including honey",
              "isinglass",
              "or fish sauce):",
              "plant-based → vegan",
              "set:honey",
              "shellac",
              "wine clarified with isinglass",
              "worcestershire sauce (traditional)"
            ],
            "expected": [
              "carmine",
              "honey",
              "shellac",
              "wine clarified with isinglass",
              "worcestershire sauce (traditional)"
            ],
            "precision": 0.1667,
            "recall": 0.8
          },
          "metrics": {
            "input_tokens": 1128,
            "output_tokens": 248,
            "latency_ms": 4362.9,
            "usd": 0.007104,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "I'll evaluate each item against the vegan rule (no animal-derived ingredients):\n\n- **honey** – from bees → NOT vegan\n- **agave nectar** – plant-derived → vegan\n- **wine clarified with isinglass** – isinglass is fish-derived → NOT vegan\n- **wine clarified with bentonite** – bentonite is a mineral clay → vegan\n- **carmine** – from cochineal insects → NOT vegan\n- **nutritional yeast** – fungus, plant-based → vegan\n- **shellac** – secreted by lac insects → NOT vegan\n- **maple syrup** – plant-derived → vegan\n- **worcestershire sauce (traditional)** – contains anchovies → NOT vegan\n- **vegetable glycerin** – plant-derived → vegan\n\nset:honey, wine clarified with isinglass, carmine, shellac, worcestershire sauce (traditional)",
          "compliant": true,
          "score": 0.3636,
          "detail": {
            "predicted": [
              "- **agave nectar** – plant-derived → vegan",
              "- **carmine** – from cochineal insects → NOT vegan",
              "- **honey** – from bees → NOT vegan",
              "- **maple syrup** – plant-derived → vegan",
              "- **nutritional yeast** – fungus",
              "- **shellac** – secreted by lac insects → NOT vegan",
              "- **vegetable glycerin** – plant-derived → vegan",
              "- **wine clarified with bentonite** – bentonite is a mineral clay → vegan",
              "- **wine clarified with isinglass** – isinglass is fish-derived → NOT vegan",
              "- **worcestershire sauce (traditional)** – contains anchovies → NOT vegan",
              "I'll evaluate each item against the vegan rule (no animal-derived ingredients):",
              "carmine",
              "plant-based → vegan",
              "set:honey",
              "shellac",
              "wine clarified with isinglass",
              "worcestershire sauce (traditional)"
            ],
            "expected": [
              "carmine",
              "honey",
              "shellac",
              "wine clarified with isinglass",
              "worcestershire sauce (traditional)"
            ],
            "precision": 0.2353,
            "recall": 0.8
          },
          "metrics": {
            "input_tokens": 1128,
            "output_tokens": 216,
            "latency_ms": 4441.4,
            "usd": 0.006624,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.3051,
      "score_summary": {
        "mean": 0.3051,
        "stdev": 0.0506,
        "sem": 0.0292,
        "ci95_lo": 0.2478,
        "ci95_hi": 0.3624,
        "n": 3
      },
      "score_if_valid": 0.3051,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1128.0,
        "output_tokens_mean": 238.667,
        "latency_ms_mean": 4282.633,
        "usd_sum": 0.020892
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-010",
      "category": "dietary",
      "capability": "cross_contact_risk",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "set_f1",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "35c29846f76046d7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. 
Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nA kitchen fries breaded calamari, battered cod, and french fries in a single deep fryer. Return the EU allergens that a customer ordering 'fries only' must be warned about as a comma-separated OGS allergen ID list.\n\nINPUT:\n{\n  \"scenario\": \"single shared fryer\",\n  \"items\": [\n    \"breaded calamari\",\n    \"battered cod\",\n    \"french fries\"\n  ]\n}\n\nANSWER FORMAT: set:comma-separated OGS allergen IDs",
      "expected": {
        "values": [
          "molluscs",
          "fish",
          "cereals_containing_gluten"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "I need to identify all allergens present in the shared fryer that a \"fries only\" customer must be warned about due to cross-contamination.\n\n**Allergens from each item cooked in the shared fryer:**\n\n1. **Breaded calamari**:\n   - Breading: `cereals_containing_gluten`\n   - Calamari (squid): `molluscs`\n   - May also contain `eggs` (in breading batter)\n\n2. **Battered cod**:\n   - Batter: `cereals_containing_gluten`\n   - Cod: `fish`\n   - Batter typically contains `eggs` and/or `milk`\n\n3. **French fries** (base ingredient — potato, oil):\n   - No major allergens intrinsically\n\nSince all items share the same fryer, the oil becomes contaminated with allergens from all items. A customer ordering fries only must be warned about all allergens introduced into the fryer:\n\n- `cereals_containing_gluten` (from breading and batter)\n- `molluscs` (from calamari/squid)\n- `fish` (from cod)\n- `eggs` (commonly in breading/batter)\n- `milk` (commonly in batter)\n\nset:cereals_containing_gluten,molluscs,fish,eggs,milk",
          "compliant": true,
          "score": 0.1429,
          "detail": {
            "predicted": [
              "**Allergens from each item cooked in the shared fryer:**",
              "- Batter typically contains `eggs` and/or `milk",
              "- Batter: `cereals_containing_gluten",
              "- Breading: `cereals_containing_gluten",
              "- Calamari (squid): `molluscs",
              "- Cod: `fish",
              "- May also contain `eggs` (in breading batter)",
              "- No major allergens intrinsically",
              "- `cereals_containing_gluten` (from breading and batter)",
              "- `eggs` (commonly in breading/batter)",
              "- `fish` (from cod)",
              "- `milk` (commonly in batter)",
              "- `molluscs` (from calamari/squid)",
              "1. **Breaded calamari**:",
              "2. **Battered cod**:",
              "3. **French fries** (base ingredient — potato",
              "I need to identify all allergens present in the shared fryer that a \"fries only\" customer must be warned about due to cross-contamination.",
              "Since all items share the same fryer",
              "eggs",
              "fish",
              "milk",
              "molluscs",
              "oil):",
              "set:cereals_containing_gluten",
              "the oil becomes contaminated with allergens from all items. A customer ordering fries only must be warned about all allergens introduced into the fryer:"
            ],
            "expected": [
              "cereals_containing_gluten",
              "fish",
              "molluscs"
            ],
            "precision": 0.08,
            "recall": 0.6667
          },
          "metrics": {
            "input_tokens": 1077,
            "output_tokens": 319,
            "latency_ms": 6062.6,
            "usd": 0.008016,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "I need to identify all allergens present in the shared fryer, since cross-contamination means a customer ordering fries only must be warned about allergens from all items cooked in that fryer.\n\n**Breaded calamari:**\n- Breadcrumbs → `cereals_containing_gluten`\n- Calamari (squid) → `molluscs`\n- May contain eggs in the breading → `eggs`\n\n**Battered cod:**\n- Batter (flour) → `cereals_containing_gluten`\n- Cod → `fish`\n- Batter often contains `eggs` and `milk`\n\n**French fries:**\n- Potato only (no allergens inherently)\n\n**Cross-contamination from shared fryer:**\nAll allergens from breaded calamari and battered cod contaminate the fries.\n\nAllergens to declare:\n- `cereals_containing_gluten` (breadcrumbs, batter)\n- `molluscs` (calamari/squid)\n- `fish` (cod)\n- `eggs` (breading/batter)\n- `milk` (batter)\n\nset:cereals_containing_gluten,molluscs,fish,eggs,milk",
          "compliant": true,
          "score": 0.1379,
          "detail": {
            "predicted": [
              "**Battered cod:**",
              "**Breaded calamari:**",
              "**Cross-contamination from shared fryer:**",
              "**French fries:**",
              "- Batter (flour) → `cereals_containing_gluten",
              "- Batter often contains `eggs` and `milk",
              "- Breadcrumbs → `cereals_containing_gluten",
              "- Calamari (squid) → `molluscs",
              "- Cod → `fish",
              "- May contain eggs in the breading → `eggs",
              "- Potato only (no allergens inherently)",
              "- `cereals_containing_gluten` (breadcrumbs",
              "- `eggs` (breading/batter)",
              "- `fish` (cod)",
              "- `milk` (batter)",
              "- `molluscs` (calamari/squid)",
              "All allergens from breaded calamari and battered cod contaminate the fries.",
              "Allergens to declare:",
              "I need to identify all allergens present in the shared fryer",
              "batter)",
              "eggs",
              "fish",
              "milk",
              "molluscs",
              "set:cereals_containing_gluten",
              "since cross-contamination means a customer ordering fries only must be warned about allergens from all items cooked in that fryer."
            ],
            "expected": [
              "cereals_containing_gluten",
              "fish",
              "molluscs"
            ],
            "precision": 0.0769,
            "recall": 0.6667
          },
          "metrics": {
            "input_tokens": 1077,
            "output_tokens": 287,
            "latency_ms": 7003.4,
            "usd": 0.007536,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "I need to identify all allergens present in the shared fryer that could contaminate the french fries.\n\n**French fries themselves** contain no major allergens (just potato and oil).\n\n**Cross-contamination from the shared fryer:**\n\n1. **Breaded calamari** contains:\n   - `cereals_containing_gluten` (breadcrumbs)\n   - `molluscs` (calamari/squid)\n   - Possibly `eggs` (breading batter)\n\n2. **Battered cod** contains:\n   - `cereals_containing_gluten` (batter)\n   - `fish` (cod)\n   - Possibly `eggs` (batter)\n   - Possibly `milk` (batter)\n\nSince all items share the same fryer, a customer ordering fries only must be warned about all allergens introduced via cross-contamination from the other items.\n\nAllergens to declare:\n\n- `cereals_containing_gluten` (from breading/batter)\n- `molluscs` (from calamari)\n- `fish` (from cod)\n- `eggs` (from breading/batter)\n- `milk` (from batter)\n\nset:cereals_containing_gluten,molluscs,fish,eggs,milk",
          "compliant": true,
          "score": 0.1429,
          "detail": {
            "predicted": [
              "**Cross-contamination from the shared fryer:**",
              "**French fries themselves** contain no major allergens (just potato and oil).",
              "- Possibly `eggs` (batter)",
              "- Possibly `eggs` (breading batter)",
              "- Possibly `milk` (batter)",
              "- `cereals_containing_gluten` (batter)",
              "- `cereals_containing_gluten` (breadcrumbs)",
              "- `cereals_containing_gluten` (from breading/batter)",
              "- `eggs` (from breading/batter)",
              "- `fish` (cod)",
              "- `fish` (from cod)",
              "- `milk` (from batter)",
              "- `molluscs` (calamari/squid)",
              "- `molluscs` (from calamari)",
              "1. **Breaded calamari** contains:",
              "2. **Battered cod** contains:",
              "Allergens to declare:",
              "I need to identify all allergens present in the shared fryer that could contaminate the french fries.",
              "Since all items share the same fryer",
              "a customer ordering fries only must be warned about all allergens introduced via cross-contamination from the other items.",
              "eggs",
              "fish",
              "milk",
              "molluscs",
              "set:cereals_containing_gluten"
            ],
            "expected": [
              "cereals_containing_gluten",
              "fish",
              "molluscs"
            ],
            "precision": 0.08,
            "recall": 0.6667
          },
          "metrics": {
            "input_tokens": 1077,
            "output_tokens": 301,
            "latency_ms": 5553.7,
            "usd": 0.007746,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.1412,
      "score_summary": {
        "mean": 0.1412,
        "stdev": 0.0029,
        "sem": 0.0017,
        "ci95_lo": 0.138,
        "ci95_hi": 0.1445,
        "n": 3
      },
      "score_if_valid": 0.1412,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1077.0,
        "output_tokens_mean": 302.333,
        "latency_ms_mean": 6206.567,
        "usd_sum": 0.023298
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-011",
      "category": "dietary",
      "capability": "low_fodmap",
      "difficulty": "hard",
      "cuisine_region": "global",
      "scorer": "set_f1",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "bdd7970997c11172",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. 
Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nReturn the low-FODMAP violations in this dish as a comma-separated list of ingredient names: garlic-infused olive oil, white onion, lactose-free milk, wheat pasta, lean chicken, fresh chives, honey, aged parmesan.\n\nINPUT:\n{\n  \"ingredients\": [\n    \"garlic-infused olive oil\",\n    \"white onion\",\n    \"lactose-free milk\",\n    \"wheat pasta\",\n    \"lean chicken\",\n    \"fresh chives\",\n    \"honey\",\n    \"aged parmesan\"\n  ]\n}\n\nANSWER FORMAT: set:comma-separated ingredient names",
      "expected": {
        "values": [
          "white onion",
          "wheat pasta",
          "honey"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "I need to identify which ingredients violate low-FODMAP rules.\n\nLet me check each ingredient:\n\n- **garlic-infused olive oil**: Low-FODMAP compliant (the FODMAPs in garlic don't transfer to oil, so garlic-infused oil is generally considered low-FODMAP) ✓\n- **white onion**: High-FODMAP (onions are explicitly listed as high-FODMAP) ✗\n- **lactose-free milk**: Low-FODMAP compliant (lactose removed) ✓\n- **wheat pasta**: High-FODMAP (wheat is explicitly listed as high-FODMAP) ✗\n- **lean chicken**: Low-FODMAP compliant (plain meat is fine) ✓\n- **fresh chives**: Low-FODMAP compliant (chives are low-FODMAP) ✓\n- **honey**: High-FODMAP (honey is explicitly listed as high-FODMAP) ✗\n- **aged parmesan**: Low-FODMAP compliant (aged hard cheeses are low in lactose and considered low-FODMAP) ✓\n\nset:white onion, wheat pasta, honey",
          "compliant": true,
          "score": 0.2353,
          "detail": {
            "predicted": [
              "- **aged parmesan**: Low-FODMAP compliant (aged hard cheeses are low in lactose and considered low-FODMAP) ✓",
              "- **fresh chives**: Low-FODMAP compliant (chives are low-FODMAP) ✓",
              "- **garlic-infused olive oil**: Low-FODMAP compliant (the FODMAPs in garlic don't transfer to oil",
              "- **honey**: High-FODMAP (honey is explicitly listed as high-FODMAP) ✗",
              "- **lactose-free milk**: Low-FODMAP compliant (lactose removed) ✓",
              "- **lean chicken**: Low-FODMAP compliant (plain meat is fine) ✓",
              "- **wheat pasta**: High-FODMAP (wheat is explicitly listed as high-FODMAP) ✗",
              "- **white onion**: High-FODMAP (onions are explicitly listed as high-FODMAP) ✗",
              "I need to identify which ingredients violate low-FODMAP rules.",
              "Let me check each ingredient:",
              "honey",
              "set:white onion",
              "so garlic-infused oil is generally considered low-FODMAP) ✓",
              "wheat pasta"
            ],
            "expected": [
              "honey",
              "wheat pasta",
              "white onion"
            ],
            "precision": 0.1429,
            "recall": 0.6667
          },
          "metrics": {
            "input_tokens": 1096,
            "output_tokens": 276,
            "latency_ms": 4334.7,
            "usd": 0.007428,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "I need to identify which ingredients violate low-FODMAP rules.\n\nLow-FODMAP restrictions: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nLet me check each ingredient:\n\n1. **garlic-infused olive oil** - If properly prepared (oil only, no garlic pieces), this is generally considered low-FODMAP as FODMAPs don't dissolve in oil. However, the name suggests garlic content. By strict rules, garlic is high-FODMAP. This is borderline, but typically allowed in low-FODMAP. I'll consider it compliant as a standard allowance.\n\n2. **white onion** - Onion is explicitly listed as a high-FODMAP ingredient. **VIOLATION**\n\n3. **lactose-free milk** - Lactose-free, so compliant.\n\n4. **wheat pasta** - Wheat is explicitly listed as high-FODMAP. **VIOLATION**\n\n5. **lean chicken** - Plain meat, low-FODMAP. Compliant.\n\n6. **fresh chives** - Chives are generally considered low-FODMAP in small amounts. Compliant.\n\n7. **honey** - Honey is explicitly listed as high-FODMAP. **VIOLATION**\n\n8. **aged parmesan** - Aged hard cheeses are low-FODMAP. Compliant.\n\nset:white onion, wheat pasta, honey",
          "compliant": true,
          "score": 0.1379,
          "detail": {
            "predicted": [
              "1. **garlic-infused olive oil** - If properly prepared (oil only",
              "2. **white onion** - Onion is explicitly listed as a high-FODMAP ingredient. **VIOLATION**",
              "3. **lactose-free milk** - Lactose-free",
              "4. **wheat pasta** - Wheat is explicitly listed as high-FODMAP. **VIOLATION**",
              "5. **lean chicken** - Plain meat",
              "6. **fresh chives** - Chives are generally considered low-FODMAP in small amounts. Compliant.",
              "7. **honey** - Honey is explicitly listed as high-FODMAP. **VIOLATION**",
              "8. **aged parmesan** - Aged hard cheeses are low-FODMAP. Compliant.",
              "I need to identify which ingredients violate low-FODMAP rules.",
              "Let me check each ingredient:",
              "Low-FODMAP restrictions: restrict high-FODMAP ingredients (onion",
              "beans",
              "but typically allowed in low-FODMAP. I'll consider it compliant as a standard allowance.",
              "garlic",
              "garlic is high-FODMAP. This is borderline",
              "honey",
              "lactose",
              "low-FODMAP. Compliant.",
              "most stone fruit).",
              "no garlic pieces)",
              "set:white onion",
              "so compliant.",
              "the name suggests garlic content. By strict rules",
              "this is generally considered low-FODMAP as FODMAPs don't dissolve in oil. However",
              "wheat",
              "wheat pasta"
            ],
            "expected": [
              "honey",
              "wheat pasta",
              "white onion"
            ],
            "precision": 0.0769,
            "recall": 0.6667
          },
          "metrics": {
            "input_tokens": 1096,
            "output_tokens": 337,
            "latency_ms": 6210.6,
            "usd": 0.008343,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "I need to identify which ingredients violate low-FODMAP rules.\n\nLow-FODMAP restricts: onion, garlic, wheat, beans, lactose, honey, most stone fruit.\n\nLet me check each ingredient:\n\n- **garlic-infused olive oil**: If properly made (only infused, no garlic pieces), it's generally considered low-FODMAP. However, the name suggests it may contain garlic solids — I'll treat it as compliant if it's just the oil, but the name is ambiguous. Per strict OGS rules, garlic is a high-FODMAP ingredient, but garlic-infused oil (without garlic pieces) is typically allowed. I'll consider it compliant.\n- **white onion**: HIGH-FODMAP — onion is explicitly listed as a violation.\n- **lactose-free milk**: low-FODMAP compliant (lactose removed).\n- **wheat pasta**: HIGH-FODMAP — wheat is explicitly listed.\n- **lean chicken**: compliant.\n- **fresh chives**: Low-FODMAP compliant (chives are allowed in small amounts; they are a green part of an allium but generally considered low-FODMAP).\n- **honey**: HIGH-FODMAP — explicitly listed.\n- **aged parmesan**: Low-FODMAP compliant (aged cheeses are very low in lactose).\n\nset:white onion, wheat pasta, honey",
          "compliant": true,
          "score": 0.1429,
          "detail": {
            "predicted": [
              "- **aged parmesan**: Low-FODMAP compliant (aged cheeses are very low in lactose).",
              "- **fresh chives**: Low-FODMAP compliant (chives are allowed in small amounts; they are a green part of an allium but generally considered low-FODMAP).",
              "- **garlic-infused olive oil**: If properly made (only infused",
              "- **honey**: HIGH-FODMAP — explicitly listed.",
              "- **lactose-free milk**: low-FODMAP compliant (lactose removed).",
              "- **lean chicken**: compliant.",
              "- **wheat pasta**: HIGH-FODMAP — wheat is explicitly listed.",
              "- **white onion**: HIGH-FODMAP — onion is explicitly listed as a violation.",
              "I need to identify which ingredients violate low-FODMAP rules.",
              "Let me check each ingredient:",
              "Low-FODMAP restricts: onion",
              "beans",
              "but garlic-infused oil (without garlic pieces) is typically allowed. I'll consider it compliant.",
              "but the name is ambiguous. Per strict OGS rules",
              "garlic",
              "garlic is a high-FODMAP ingredient",
              "honey",
              "it's generally considered low-FODMAP. However",
              "lactose",
              "most stone fruit.",
              "no garlic pieces)",
              "set:white onion",
              "the name suggests it may contain garlic solids — I'll treat it as compliant if it's just the oil",
              "wheat",
              "wheat pasta"
            ],
            "expected": [
              "honey",
              "wheat pasta",
              "white onion"
            ],
            "precision": 0.08,
            "recall": 0.6667
          },
          "metrics": {
            "input_tokens": 1096,
            "output_tokens": 324,
            "latency_ms": 5971.9,
            "usd": 0.008148,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.172,
      "score_summary": {
        "mean": 0.172,
        "stdev": 0.0548,
        "sem": 0.0317,
        "ci95_lo": 0.11,
        "ci95_hi": 0.2341,
        "n": 3
      },
      "score_if_valid": 0.172,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1096.0,
        "output_tokens_mean": 312.333,
        "latency_ms_mean": 5505.733,
        "usd_sum": 0.023919
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:diet-012",
      "category": "dietary",
      "capability": "pescatarian_vs_vegetarian",
      "difficulty": "easy",
      "cuisine_region": "japanese",
      "scorer": "set_f1",
      "context_needs": [
        "rules.dietary",
        "rules.allergens_eu",
        "vocab.dietary_labels"
      ],
      "prompt_template_hash": "42c897d741064981",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. 
Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n### Dietary labels\n\nvegan, vegetarian, pescatarian, halal, kosher, gluten_free, dairy_free, nut_free, shellfish_free, egg_free, soy_free, low_fodmap\n\n---\n\nA dish contains miso, dashi (made with katsuobushi), tofu, and wakame. Mark which labels it qualifies for out of: vegan, vegetarian, pescatarian. Return a comma-separated list.\n\nINPUT:\n{\n  \"dish\": \"classic awase-dashi miso soup with tofu and wakame\"\n}\n\nANSWER FORMAT: set:comma-separated labels",
      "expected": {
        "values": [
          "pescatarian"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Let me analyze each ingredient:\n\n- **Miso**: typically made from soybeans, salt, and koji — plant-based ✓\n- **Dashi (katsuobushi)**: made from dried fermented bonito (fish) — this is fish-derived ✗ for vegan/vegetarian, ✓ for pescatarian\n- **Tofu**: made from soybeans — plant-based ✓\n- **Wakame**: seaweed — plant-based ✓\n\nThe katsuobushi (bonito fish flakes) in the dashi means:\n- **vegan**: ✗ (contains fish)\n- **vegetarian**: ✗ (contains fish)\n- **pescatarian**: ✓ (fish is permitted)\n\nset:pescatarian",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": [
              "- **Dashi (katsuobushi)**: made from dried fermented bonito (fish) — this is fish-derived ✗ for vegan/vegetarian",
              "- **Miso**: typically made from soybeans",
              "- **Tofu**: made from soybeans — plant-based ✓",
              "- **Wakame**: seaweed — plant-based ✓",
              "- **pescatarian**: ✓ (fish is permitted)",
              "- **vegan**: ✗ (contains fish)",
              "- **vegetarian**: ✗ (contains fish)",
              "Let me analyze each ingredient:",
              "The katsuobushi (bonito fish flakes) in the dashi means:",
              "and koji — plant-based ✓",
              "salt",
              "set:pescatarian",
              "✓ for pescatarian"
            ],
            "expected": [
              "pescatarian"
            ],
            "precision": 0.0,
            "recall": 0.0
          },
          "metrics": {
            "input_tokens": 1043,
            "output_tokens": 187,
            "latency_ms": 4072.5,
            "usd": 0.005934,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Let me analyze each ingredient:\n\n- **Miso**: fermented soybean paste — plant-based ✓\n- **Dashi (katsuobushi)**: made with dried fermented tuna/bonito flakes — this is **fish**\n- **Tofu**: soy-based — plant-based ✓\n- **Wakame**: seaweed — plant-based ✓\n\nNow checking labels:\n- **Vegan**: No — contains fish (katsuobushi)\n- **Vegetarian**: No — contains fish (katsuobushi); fish is excluded from vegetarian\n- **Pescatarian**: Yes — pescatarian allows fish, and no meat or poultry is present\n\nset:pescatarian",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": [
              "- **Dashi (katsuobushi)**: made with dried fermented tuna/bonito flakes — this is **fish**",
              "- **Miso**: fermented soybean paste — plant-based ✓",
              "- **Pescatarian**: Yes — pescatarian allows fish",
              "- **Tofu**: soy-based — plant-based ✓",
              "- **Vegan**: No — contains fish (katsuobushi)",
              "- **Vegetarian**: No — contains fish (katsuobushi); fish is excluded from vegetarian",
              "- **Wakame**: seaweed — plant-based ✓",
              "Let me analyze each ingredient:",
              "Now checking labels:",
              "and no meat or poultry is present",
              "set:pescatarian"
            ],
            "expected": [
              "pescatarian"
            ],
            "precision": 0.0,
            "recall": 0.0
          },
          "metrics": {
            "input_tokens": 1043,
            "output_tokens": 171,
            "latency_ms": 3792.0,
            "usd": 0.005694,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Let me analyze each ingredient:\n\n- **Miso**: fermented soybean paste — vegan/vegetarian/pescatarian compatible\n- **Dashi (katsuobushi)**: made with dried bonito (fish) — not vegan, not vegetarian, but pescatarian compatible\n- **Tofu**: soy-based — vegan/vegetarian/pescatarian compatible\n- **Wakame**: seaweed — vegan/vegetarian/pescatarian compatible\n\nThe katsuobushi (fish) in the dashi disqualifies **vegan** and **vegetarian**, but is compatible with **pescatarian**.\n\nset:pescatarian",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": [
              "- **Dashi (katsuobushi)**: made with dried bonito (fish) — not vegan",
              "- **Miso**: fermented soybean paste — vegan/vegetarian/pescatarian compatible",
              "- **Tofu**: soy-based — vegan/vegetarian/pescatarian compatible",
              "- **Wakame**: seaweed — vegan/vegetarian/pescatarian compatible",
              "Let me analyze each ingredient:",
              "The katsuobushi (fish) in the dashi disqualifies **vegan** and **vegetarian**",
              "but is compatible with **pescatarian**.",
              "but pescatarian compatible",
              "not vegetarian",
              "set:pescatarian"
            ],
            "expected": [
              "pescatarian"
            ],
            "precision": 0.0,
            "recall": 0.0
          },
          "metrics": {
            "input_tokens": 1043,
            "output_tokens": 151,
            "latency_ms": 3335.1,
            "usd": 0.005394,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1043.0,
        "output_tokens_mean": 169.667,
        "latency_ms_mean": 3733.2,
        "usd_sum": 0.017022
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:menu-001",
      "category": "menu_engineering",
      "capability": "matrix_classification",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.menu_engineering",
        "vocab.courses",
        "vocab.component_roles"
      ],
      "prompt_template_hash": "138c09ed611a2d43",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Menu engineering reference\n\nClassical Boston Matrix quadrants (popularity vs. contribution margin, each split at the menu average):\n- **Star**: high popularity, high margin → feature prominently, do   not change.\n- **Plowhorse**: high popularity, low margin → redesign the dish to   cut food cost or reprice gently.\n- **Puzzle**: low popularity, high margin → reposition on the menu,   rename, or add a compelling description.\n- **Dog**: low popularity, low margin → remove or replace.\n\nContribution margin (CM) = menu price − food cost. Food-cost percentage = food cost / menu price. Industry target for casual dining: 28–35% food cost.\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nA dish has above-average popularity and above-average contribution margin on the menu. Using the OGS menu-engineering matrix, which quadrant is it? 
Respond with one word.\n\nINPUT:\n{\n  \"popularity\": \"above\",\n  \"margin\": \"above\"\n}\n\nANSWER FORMAT: exact:star|plowhorse|puzzle|dog",
      "expected": {
        "value": "star"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Star",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "star",
            "expected": "star",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 558,
            "output_tokens": 4,
            "latency_ms": 851.9,
            "usd": 0.001734,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Star",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "star",
            "expected": "star",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 558,
            "output_tokens": 4,
            "latency_ms": 3988.4,
            "usd": 0.001734,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Star",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "star",
            "expected": "star",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 558,
            "output_tokens": 4,
            "latency_ms": 693.2,
            "usd": 0.001734,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 558.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1844.5,
        "usd_sum": 0.005202
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:menu-002",
      "category": "menu_engineering",
      "capability": "matrix_classification",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.menu_engineering",
        "vocab.courses",
        "vocab.component_roles"
      ],
      "prompt_template_hash": "b5df321fd829de76",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Menu engineering reference\n\nClassical Boston Matrix quadrants (popularity vs. contribution margin, each split at the menu average):\n- **Star**: high popularity, high margin → feature prominently, do   not change.\n- **Plowhorse**: high popularity, low margin → redesign the dish to   cut food cost or reprice gently.\n- **Puzzle**: low popularity, high margin → reposition on the menu,   rename, or add a compelling description.\n- **Dog**: low popularity, low margin → remove or replace.\n\nContribution margin (CM) = menu price − food cost. Food-cost percentage = food cost / menu price. Industry target for casual dining: 28–35% food cost.\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nA dish sells strongly but the contribution margin is below the menu average. 
Classify it.\n\nINPUT:\n{\n  \"popularity\": \"above\",\n  \"margin\": \"below\"\n}\n\nANSWER FORMAT: exact:star|plowhorse|puzzle|dog",
      "expected": {
        "value": "plowhorse"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "plowhorse",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "plowhorse",
            "expected": "plowhorse",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 537,
            "output_tokens": 7,
            "latency_ms": 1481.6,
            "usd": 0.001716,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "plowhorse",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "plowhorse",
            "expected": "plowhorse",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 537,
            "output_tokens": 7,
            "latency_ms": 1124.1,
            "usd": 0.001716,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "plowhorse",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "plowhorse",
            "expected": "plowhorse",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 537,
            "output_tokens": 7,
            "latency_ms": 1385.9,
            "usd": 0.001716,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 537.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 1330.533,
        "usd_sum": 0.005148
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:menu-003",
      "category": "menu_engineering",
      "capability": "matrix_classification",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.menu_engineering",
        "vocab.courses",
        "vocab.component_roles"
      ],
      "prompt_template_hash": "9795b72bdda437f1",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Menu engineering reference\n\nClassical Boston Matrix quadrants (popularity vs. contribution margin, each split at the menu average):\n- **Star**: high popularity, high margin → feature prominently, do   not change.\n- **Plowhorse**: high popularity, low margin → redesign the dish to   cut food cost or reprice gently.\n- **Puzzle**: low popularity, high margin → reposition on the menu,   rename, or add a compelling description.\n- **Dog**: low popularity, low margin → remove or replace.\n\nContribution margin (CM) = menu price − food cost. Food-cost percentage = food cost / menu price. Industry target for casual dining: 28–35% food cost.\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nA dish has strong contribution margin but is rarely ordered. 
Classify it.\n\nINPUT:\n{\n  \"popularity\": \"below\",\n  \"margin\": \"above\"\n}\n\nANSWER FORMAT: exact:star|plowhorse|puzzle|dog",
      "expected": {
        "value": "puzzle"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "puzzle",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "puzzle",
            "expected": "puzzle",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 534,
            "output_tokens": 4,
            "latency_ms": 865.7,
            "usd": 0.001662,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "puzzle",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "puzzle",
            "expected": "puzzle",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 534,
            "output_tokens": 4,
            "latency_ms": 878.5,
            "usd": 0.001662,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "puzzle",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "puzzle",
            "expected": "puzzle",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 534,
            "output_tokens": 4,
            "latency_ms": 707.1,
            "usd": 0.001662,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 534.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 817.1,
        "usd_sum": 0.004986
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:menu-004",
      "category": "menu_engineering",
      "capability": "matrix_classification",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.menu_engineering",
        "vocab.courses",
        "vocab.component_roles"
      ],
      "prompt_template_hash": "fc0da6dcbbc842fb",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Menu engineering reference\n\nClassical Boston Matrix quadrants (popularity vs. contribution margin, each split at the menu average):\n- **Star**: high popularity, high margin → feature prominently, do   not change.\n- **Plowhorse**: high popularity, low margin → redesign the dish to   cut food cost or reprice gently.\n- **Puzzle**: low popularity, high margin → reposition on the menu,   rename, or add a compelling description.\n- **Dog**: low popularity, low margin → remove or replace.\n\nContribution margin (CM) = menu price − food cost. Food-cost percentage = food cost / menu price. Industry target for casual dining: 28–35% food cost.\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nA dish is rarely ordered and has a contribution margin below the menu average. 
Classify it.\n\nINPUT:\n{\n  \"popularity\": \"below\",\n  \"margin\": \"below\"\n}\n\nANSWER FORMAT: exact:star|plowhorse|puzzle|dog",
      "expected": {
        "value": "dog"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "dog",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "dog",
            "expected": "dog",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 538,
            "output_tokens": 4,
            "latency_ms": 689.0,
            "usd": 0.001674,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "dog",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "dog",
            "expected": "dog",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 538,
            "output_tokens": 4,
            "latency_ms": 1013.2,
            "usd": 0.001674,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "dog",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "dog",
            "expected": "dog",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 538,
            "output_tokens": 4,
            "latency_ms": 748.3,
            "usd": 0.001674,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 538.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 816.833,
        "usd_sum": 0.005022
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:menu-005",
      "category": "menu_engineering",
      "capability": "food_cost",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.menu_engineering",
        "vocab.courses",
        "vocab.component_roles"
      ],
      "prompt_template_hash": "f33710f88f89121e",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Menu engineering reference\n\nClassical Boston Matrix quadrants (popularity vs. contribution margin, each split at the menu average):\n- **Star**: high popularity, high margin → feature prominently, do   not change.\n- **Plowhorse**: high popularity, low margin → redesign the dish to   cut food cost or reprice gently.\n- **Puzzle**: low popularity, high margin → reposition on the menu,   rename, or add a compelling description.\n- **Dog**: low popularity, low margin → remove or replace.\n\nContribution margin (CM) = menu price − food cost. Food-cost percentage = food cost / menu price. Industry target for casual dining: 28–35% food cost.\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nA dish has a food cost of £4.20 and a menu price of £14.00. 
Return the food-cost percentage as a single integer.\n\nINPUT:\n{\n  \"food_cost\": 4.2,\n  \"menu_price\": 14.0\n}\n\nANSWER FORMAT: numeric:integer percent",
      "expected": {
        "value": 30,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "30",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 30.0,
            "expected": 30.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 551,
            "output_tokens": 5,
            "latency_ms": 1095.4,
            "usd": 0.001728,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "30",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 30.0,
            "expected": 30.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 551,
            "output_tokens": 5,
            "latency_ms": 1037.5,
            "usd": 0.001728,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "30",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 30.0,
            "expected": 30.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 551,
            "output_tokens": 5,
            "latency_ms": 1318.0,
            "usd": 0.001728,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 551.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1150.3,
        "usd_sum": 0.005184
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:menu-006",
      "category": "menu_engineering",
      "capability": "contribution_margin",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.menu_engineering",
        "vocab.courses",
        "vocab.component_roles"
      ],
      "prompt_template_hash": "7bb019ca9ca4b1be",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Menu engineering reference\n\nClassical Boston Matrix quadrants (popularity vs. contribution margin, each split at the menu average):\n- **Star**: high popularity, high margin → feature prominently, do   not change.\n- **Plowhorse**: high popularity, low margin → redesign the dish to   cut food cost or reprice gently.\n- **Puzzle**: low popularity, high margin → reposition on the menu,   rename, or add a compelling description.\n- **Dog**: low popularity, low margin → remove or replace.\n\nContribution margin (CM) = menu price − food cost. Food-cost percentage = food cost / menu price. Industry target for casual dining: 28–35% food cost.\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nA dish has a food cost of $6.50 and a menu price of $22.00. 
Return the contribution margin in dollars as a single number with two decimals.\n\nINPUT:\n{\n  \"food_cost\": 6.5,\n  \"menu_price\": 22.0\n}\n\nANSWER FORMAT: numeric:decimal USD",
      "expected": {
        "value": 15.5,
        "tolerance": 0.05
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "15.50",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 15.5,
            "expected": 15.5,
            "tolerance": 0.05
          },
          "metrics": {
            "input_tokens": 555,
            "output_tokens": 7,
            "latency_ms": 1208.2,
            "usd": 0.00177,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "15.50",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 15.5,
            "expected": 15.5,
            "tolerance": 0.05
          },
          "metrics": {
            "input_tokens": 555,
            "output_tokens": 7,
            "latency_ms": 1009.0,
            "usd": 0.00177,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "15.50",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 15.5,
            "expected": 15.5,
            "tolerance": 0.05
          },
          "metrics": {
            "input_tokens": 555,
            "output_tokens": 7,
            "latency_ms": 961.2,
            "usd": 0.00177,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 555.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 1059.467,
        "usd_sum": 0.00531
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:menu-007",
      "category": "menu_engineering",
      "capability": "menu_rebalance",
      "difficulty": "hard",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.menu_engineering",
        "vocab.courses",
        "vocab.component_roles"
      ],
      "prompt_template_hash": "68b7fb2ec1e432f4",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Menu engineering reference\n\nClassical Boston Matrix quadrants (popularity vs. contribution margin, each split at the menu average):\n- **Star**: high popularity, high margin → feature prominently, do   not change.\n- **Plowhorse**: high popularity, low margin → redesign the dish to   cut food cost or reprice gently.\n- **Puzzle**: low popularity, high margin → reposition on the menu,   rename, or add a compelling description.\n- **Dog**: low popularity, low margin → remove or replace.\n\nContribution margin (CM) = menu price − food cost. Food-cost percentage = food cost / menu price. Industry target for casual dining: 28–35% food cost.\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nFive main courses have this (popularity %, margin $) profile: A (35, 18), B (30, 9), C (5, 6), D (20, 16), E (10, 7). 
Average popularity is 20%; average margin is $11.20. Which single dish is the dog? Respond with the letter only.\n\nINPUT:\n{\n  \"dishes\": [\n    {\n      \"id\": \"A\",\n      \"pop\": 35,\n      \"margin\": 18\n    },\n    {\n      \"id\": \"B\",\n      \"pop\": 30,\n      \"margin\": 9\n    },\n    {\n      \"id\": \"C\",\n      \"pop\": 5,\n      \"margin\": 6\n    },\n    {\n      \"id\": \"D\",\n      \"pop\": 20,\n      \"margin\": 16\n    },\n    {\n      \"id\": \"E\",\n      \"pop\": 10,\n      \"margin\": 7\n    }\n  ],\n  \"avg_pop\": 20,\n  \"avg_margin\": 11.2\n}\n\nANSWER FORMAT: exact:A|B|C|D|E",
      "expected": {
        "value": "C"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "C",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "c",
            "expected": "c",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 754,
            "output_tokens": 4,
            "latency_ms": 1146.1,
            "usd": 0.002322,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "C",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "c",
            "expected": "c",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 754,
            "output_tokens": 4,
            "latency_ms": 845.6,
            "usd": 0.002322,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "C",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "c",
            "expected": "c",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 754,
            "output_tokens": 4,
            "latency_ms": 689.2,
            "usd": 0.002322,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 754.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 893.633,
        "usd_sum": 0.006966
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:menu-008",
      "category": "menu_engineering",
      "capability": "target_price",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.menu_engineering",
        "vocab.courses",
        "vocab.component_roles"
      ],
      "prompt_template_hash": "dc5bf7ed3a0583c2",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Menu engineering reference\n\nClassical Boston Matrix quadrants (popularity vs. contribution margin, each split at the menu average):\n- **Star**: high popularity, high margin → feature prominently, do   not change.\n- **Plowhorse**: high popularity, low margin → redesign the dish to   cut food cost or reprice gently.\n- **Puzzle**: low popularity, high margin → reposition on the menu,   rename, or add a compelling description.\n- **Dog**: low popularity, low margin → remove or replace.\n\nContribution margin (CM) = menu price − food cost. Food-cost percentage = food cost / menu price. Industry target for casual dining: 28–35% food cost.\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nA chef wants a 28% food cost on a dish whose ingredient cost is £3.92. 
Return the target menu price in pounds, two decimals.\n\nINPUT:\n{\n  \"food_cost\": 3.92,\n  \"target_food_cost_pct\": 28\n}\n\nANSWER FORMAT: numeric:decimal GBP",
      "expected": {
        "value": 14.0,
        "tolerance": 0.05
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "14.00",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 14.0,
            "expected": 14.0,
            "tolerance": 0.05
          },
          "metrics": {
            "input_tokens": 556,
            "output_tokens": 7,
            "latency_ms": 731.0,
            "usd": 0.001773,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "14.00",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 14.0,
            "expected": 14.0,
            "tolerance": 0.05
          },
          "metrics": {
            "input_tokens": 556,
            "output_tokens": 7,
            "latency_ms": 823.8,
            "usd": 0.001773,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "The target menu price = food cost / food cost percentage = 3.92 / 0.28 = 14.00\n\n14.00",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 3.92,
            "expected": 14.0,
            "tolerance": 0.05
          },
          "metrics": {
            "input_tokens": 556,
            "output_tokens": 35,
            "latency_ms": 1341.9,
            "usd": 0.002193,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.5774,
        "sem": 0.3333,
        "ci95_lo": 0.0133,
        "ci95_hi": 1.32,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 556.0,
        "output_tokens_mean": 16.333,
        "latency_ms_mean": 965.567,
        "usd_sum": 0.005739
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:menu-009",
      "category": "menu_engineering",
      "capability": "menu_critique",
      "difficulty": "hard",
      "cuisine_region": "global",
      "scorer": "rubric_judge",
      "context_needs": [
        "rules.menu_engineering",
        "vocab.courses",
        "vocab.component_roles",
        "vocab.match_types",
        "rules.dietary",
        "rules.allergens_eu"
      ],
      "prompt_template_hash": "8883cacb0aca5024",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Menu engineering reference\n\nClassical Boston Matrix quadrants (popularity vs. contribution margin, each split at the menu average):\n- **Star**: high popularity, high margin → feature prominently, do   not change.\n- **Plowhorse**: high popularity, low margin → redesign the dish to   cut food cost or reprice gently.\n- **Puzzle**: low popularity, high margin → reposition on the menu,   rename, or add a compelling description.\n- **Dog**: low popularity, low margin → remove or replace.\n\nContribution margin (CM) = menu price − food cost. Food-cost percentage = food cost / menu price. 
Industry target for casual dining: 28–35% food cost.\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n### OGS Dietary labels and rules\n\nCanonical dietary label IDs (lowercase snake_case):\n`vegan`, `vegetarian`, `pescatarian`, `halal`, `kosher`, `gluten_free`, `dairy_free`, `nut_free`, `shellfish_free`, `egg_free`, `soy_free`, `low_fodmap`\n\nHigh-level rules (non-exhaustive):\n- `vegan`: no animal-derived ingredients, including honey, dairy, eggs, gelatine, carmine, isinglass, shellac, animal rennet, or fish sauce.\n- `vegetarian`: no meat, poultry, fish, shellfish, or animal-derived rennet/gelatine. Dairy and eggs permitted.\n- `pescatarian`: no meat or poultry. Fish and shellfish permitted.\n- `halal`: no pork or pork derivatives, no alcohol, meat must be zabihah-slaughtered. Gelatine must be from halal source.\n- `kosher`: no pork or shellfish, no mixing of meat and dairy, meat must be shechita-slaughtered. Separate utensils for meat/dairy.\n- `gluten_free`: no wheat, barley, rye, or derivatives (including malt, soy sauce with wheat, seitan).\n- `dairy_free`: no milk, butter, cream, cheese, whey, casein, ghee, yogurt, or lactose.\n- `nut_free`: no tree nuts (almond, cashew, walnut, hazelnut, pecan, pistachio, macadamia, brazil, pine nut). 
Peanuts are legumes but are usually treated as a separate allergen.\n- `shellfish_free`: no crustaceans (shrimp, crab, lobster, crayfish) or molluscs (mussel, clam, oyster, scallop, squid, octopus).\n- `low_fodmap`: restrict high-FODMAP ingredients (onion, garlic, wheat, beans, lactose, honey, most stone fruit).\n\nA recipe is compatible with a label if and only if **all** its ingredients comply with the label's rules.\n\n### EU 14 declarable allergens (Regulation 1169/2011 Annex II)\n\n- `cereals_containing_gluten`\n- `crustaceans`\n- `eggs`\n- `fish`\n- `peanuts`\n- `soybeans`\n- `milk`\n- `tree_nuts`\n- `celery`\n- `mustard`\n- `sesame`\n- `sulphur_dioxide`\n- `lupin`\n- `molluscs`\n\nUse exactly these IDs when declaring allergens in OGS documents.\n\n---\n\nCritique the following five-course autumn tasting menu. Identify at least two concrete weaknesses (flow, seasonality, allergen risk, cost-to-price ratio, or structural monotony), suggest one specific, constructive change, and justify each point in 3 short paragraphs. Avoid generic praise.\n\nINPUT:\n{\n  \"menu\": [\n    {\n      \"course\": \"amuse\",\n      \"dish\": \"pumpkin velouté, sage oil\"\n    },\n    {\n      \"course\": \"appetizer\",\n      \"dish\": \"butternut squash risotto, brown butter, amaretti crumble\"\n    },\n    {\n      \"course\": \"fish\",\n      \"dish\": \"pan-seared scallops, pumpkin purée, crispy pancetta\"\n    },\n    {\n      \"course\": \"main\",\n      \"dish\": \"roast pork loin, sweet potato purée, cider jus\"\n    },\n    {\n      \"course\": \"dessert\",\n      \"dish\": \"pumpkin cheesecake, spiced pecan brittle\"\n    }\n  ],\n  \"target_food_cost_pct\": 30,\n  \"dietary_constraints\": \"must offer a viable vegetarian and a viable nut-free path\"\n}\n\nANSWER FORMAT: free text, 3 short paragraphs",
      "expected": {
        "rubric": [
          {
            "id": "diagnosis",
            "weight": 0.4,
            "description": "Identifies concrete, defensible weaknesses (flavor monotony from repeated pumpkin/squash, nut allergen in amaretti and pecans, heavy starches in consecutive courses).",
            "anchors": {
              "0.0": "No specific weakness identified.",
              "0.5": "Identifies one concrete weakness but misses another major one.",
              "1.0": "Identifies at least two of: pumpkin/squash repetition across 4 of 5 courses; nut allergen across amaretti and pecans; consecutive purée/starch heaviness."
            }
          },
          {
            "id": "remediation",
            "weight": 0.3,
            "description": "Offers a specific, actionable, seasonally-plausible change that addresses the identified weakness.",
            "anchors": {
              "0.0": "No change proposed or change is vague.",
              "0.5": "Change is specific but does not address the main weakness.",
              "1.0": "Concrete substitute with named ingredient/technique (e.g. swap scallop course to cured mackerel with pickled apple)."
            }
          },
          {
            "id": "justification",
            "weight": 0.3,
            "description": "Justification is grounded in OGS concepts (match types, structural balance, service flow) rather than marketing language.",
            "anchors": {
              "0.0": "Pure adjective salad.",
              "0.5": "Some reasoning, but appeals to authority not mechanism.",
              "1.0": "Explicitly cites a mechanism: intensity alignment, contrast, allergen cross-contact, food-cost math."
            }
          }
        ],
        "reference_answer": "Key weaknesses: (1) pumpkin/squash appears in 4 of 5 courses, collapsing the menu's flavour arc into a single beta-carotene/sweet-starch note; (2) the amaretti crumble (bitter almond) and the pecan brittle together block every nut-allergic guest from the tasting; (3) velouté, risotto, two purées and a cheesecake stack four viscous/starchy textures in a row with no acid or raw element. A concrete fix is to replace the scallop course with cured mackerel over pickled Granny Smith apple and wasabi-lime gel: it re-introduces sea/fruit/acid, breaks the texture loop, removes one pork-cross-contact dish, and simultaneously opens a pescatarian-friendly option. Justification: on OGS terms this swap adds a `contrast` match (acid/fat), raises `acidity_perception` mid-meal to refresh the palate before the roast, and removes one of the two tree-nut exposure points."
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "**Weakness 1 – Structural Monotony and Ingredient Repetition**\n\nPumpkin or butternut squash appears in four of the five courses (amuse, appetizer, fish, and dessert), with sweet potato in the main providing only a cosmetic variation on the same sweet, starchy, orange-fleshed profile. The result is a menu that feels like a single ingredient stretched across five plates rather than an evolving seasonal narrative. A tasting menu should build and contrast; here, the palate is never given permission to reset. The suggestion is to replace the fish course's pumpkin purée with a sharp, acidic celery-root remoulade or a fennel-and-apple slaw, introducing a brassica-anise-citrus note that creates genuine contrast before the richness of the pork main.\n\n**Weakness 2 – Allergen Accumulation and Failed Dietary Paths**\n\nThe menu states it must offer a nut-free path, yet amaretti crumble in the appetizer contains almonds, and spiced pecan brittle in the dessert contains tree nuts — both `tree_nuts` allergens under EU Regulation 1169/2011 Annex II. Simply removing those garnishes leaves structurally incomplete dishes with no designed alternative. The vegetarian path is similarly thin: removing pancetta from the scallop course and the pork main simultaneously strips two courses of their primary protein and fat component, leaving a guest with an amuse, a risotto, and a cheesecake — a deeply unbalanced progression. 
A dedicated vegetarian insert (e.g., a roasted celeriac steak with wild mushroom jus and chestnut gremolata as the main) and a nut-free dessert variant (e.g., toasted pumpkin seed praline instead of pecan brittle) should be architected into the menu from the start, not treated as afterthoughts.\n\n**Weakness 3 – Cost and Margin Pressure on the Fish Course**\n\nScallops are among the highest food-cost proteins available, and pairing them with pancetta and a made-to-order pumpkin purée in a five-course tasting format at a 30% food-cost target creates serious margin exposure. A mid-range tasting menu pricing the scallop course to hit that target would likely need to portion the scallops so modestly that the dish reads as underwhelming. The recommended change from Weakness 1 — swapping pumpkin purée for a celery-root preparation — also reduces food cost slightly, but the deeper fix is to reconsider whether scallops belong in this format at all, or whether a more cost-effective but equally seasonal fish (e.g., pan-roasted gurnard or cured sea trout with apple and horseradish) could deliver better margin, a different texture, and a flavour bridge that lifts the menu out of its sweet-starchy rut.",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_criterion": [
              {
                "id": "diagnosis",
                "weight": 0.4,
                "score": 1.0,
                "note": "It clearly identifies multiple concrete weaknesses: repeated pumpkin/squash, nut-allergen blockage, weak vegetarian path, and heavy sweet-starchy structure."
              },
              {
                "id": "remediation",
                "weight": 0.3,
                "score": 1.0,
                "note": "It proposes specific, actionable substitutions such as celery-root remoulade or fennel-apple slaw, a dedicated celeriac main, pumpkin seed praline, and alternative fish options."
              },
              {
                "id": "justification",
                "weight": 0.3,
                "score": 1.0,
                "note": "The reasoning is mechanism-based, citing palate reset, structural balance, allergen compliance/cross-path design, and food-cost pressure against a 30% target."
              }
            ],
            "summary": "The candidate directly critiques the menu with concrete, defensible weaknesses and avoids generic praise. It exceeds the minimum by offering several precise fixes and grounding them in flow, allergen design, and cost mechanics, while still fitting the requested three-paragraph free-text format.",
            "judge_model": "gpt-5.4",
            "judge_prompt_hash": "756db6a487714bca",
            "judge_validated": false
          },
          "metrics": {
            "input_tokens": 1461,
            "output_tokens": 664,
            "latency_ms": 18796.0,
            "usd": 0.014343,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "**Weakness 1 – Structural Monotony (Squash/Pumpkin Saturation)**\nFour of the five courses feature pumpkin or its close cucurbit relatives (butternut squash, sweet potato is a marginal escape), and three of those four use a purée or velouté as the primary textural vehicle. By the fish course a guest has already eaten pumpkin velouté and butternut squash purée; serving another pumpkin purée under the scallops collapses any sense of progression or surprise. The flavour arc never pivots—there is no acid-led course, no bitter note, no herbaceous counterpoint, and no textural shift from the soft, starchy base that recurs plate after plate. A menu should use a signature seasonal ingredient as a motif, not a paste applied to every dish.\n\n**Weakness 2 – Allergen and Dietary Constraint Conflicts**\nThe menu is required to support both a vegetarian path and a nut-free path, and it fails on both counts without significant dish substitution. The appetizer contains amaretti crumble, which is made from almonds (a declarable tree nut), immediately blocking the nut-free path for that course. Pancetta in the fish course and the pork loin main block the vegetarian path at two consecutive courses, with no stated vegetarian alternative for either—a risotto that becomes the vegetarian \"fish-course stand-in\" would mean repeating the squash risotto already served as the appetizer. The pumpkin cheesecake also contains milk (declarable allergen) and potentially eggs, and the pecan brittle is a second tree-nut hit, compounding the nut-free problem in the dessert course.\n\n**Suggested Change – Replace the Appetizer with a Celeriac and Chestnut Velouté... no: replace the fish-course pumpkin purée and appetizer altogether.** Specifically, swap the butternut squash risotto appetizer for a seared duck liver parfait with quince gel and sourdough for omnivores, with a celeriac and truffle tart as a vegetarian equivalent. 
This single substitution removes a duplicate squash vehicle, introduces a fat-rich, umami-forward bridge between the light amuse and the protein-heavy fish course, and provides a clearly differentiated vegetarian option that does not echo the amuse. It also reduces tree-nut dependency to a single dessert garnish, making the nut-free swap (replace pecan brittle with a sesame praline or candied seeds) far simpler to execute. The duck liver or truffle cost is higher, but the contribution margin recovers because the dish commands a meaningfully higher price point than risotto, keeping food-cost percentage within the 30% target.",
          "compliant": true,
          "score": 0.85,
          "detail": {
            "per_criterion": [
              {
                "id": "diagnosis",
                "weight": 0.4,
                "score": 1.0,
                "note": "It clearly identifies multiple concrete weaknesses: repeated pumpkin/squash across the menu, soft purée/starch monotony, and nut-free/vegetarian path failures."
              },
              {
                "id": "remediation",
                "weight": 0.3,
                "score": 0.5,
                "note": "It proposes specific substitutions, but the main change is muddled by suggesting multiple replacements and one introduces chestnut, which conflicts with the nut-free constraint."
              },
              {
                "id": "justification",
                "weight": 0.3,
                "score": 1.0,
                "note": "The reasoning is mechanism-based, citing flavour arc, texture progression, dietary blocking, and food-cost percentage rather than generic praise."
              }
            ],
            "summary": "The answer follows the requested three-paragraph free-text format and gives a strong, concrete critique grounded in menu structure and allergen logic. Its main weakness is that the proposed fix is somewhat unfocused and partially undermined by a chestnut-containing suggestion, though the justification remains technically sound.",
            "judge_model": "gpt-5.4",
            "judge_prompt_hash": "756db6a487714bca",
            "judge_validated": false
          },
          "metrics": {
            "input_tokens": 1461,
            "output_tokens": 627,
            "latency_ms": 16044.9,
            "usd": 0.013788,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "**Weakness 1: Structural Monotony — Pumpkin/Squash Dominates Every Course**\n\nFour of the five courses feature pumpkin or butternut squash as a primary component (velouté, risotto filling, purée on the scallops, and cheesecake). This is not seasonal storytelling — it is ingredient repetition that will fatigue the palate long before the main course arrives. A tasting menu should use autumn's full larder: celeriac, wild mushrooms, quince, chestnuts, bitter radicchio, and game all signal the season without hammering a single note. The structural monotony also undermines the contribution of each individual dish, because guests cannot appreciate any one squash preparation when they have already encountered it twice before it arrives.\n\n**Weakness 2: Dietary Constraint Gaps — Nut-Free and Vegetarian Paths Are Not Viable As Written**\n\nThe menu states it must offer a nut-free path, yet the appetizer carries amaretti crumble (almonds — a declarable tree nut) and the dessert carries pecan brittle (also a tree nut). Removing both leaves those courses structurally incomplete, not merely modified. Simultaneously, the vegetarian path collapses at the fish course (scallops with pancetta) and the main (pork loin), leaving a vegetarian guest with only the amuse and a modified appetizer — a two-course tasting menu. Neither constraint is genuinely satisfied; they require dedicated alternative dishes, not ad-hoc subtractions that strip the dish of its textural or flavour architecture.\n\n**Suggested Change: Replace the Appetizer with a Celeriac and Wild Mushroom Tart, Hazelnut-Free Herb Crumb**\n\nSwap the butternut squash risotto for a pressed celeriac and chanterelle tart with a Comté custard, thyme oil, and a panko-parsley crumb. 
This achieves several things simultaneously: it breaks the squash monotony immediately after the amuse, introduces umami and earthy forest notes that build logically toward the pork main, satisfies the vegetarian path as a substantial standalone course (Comté is vegetarian-rennet-available), and eliminates the tree-nut allergen risk at that course entirely. Celeriac and chanterelles are both peak-autumn ingredients, the food cost is competitive with risotto, and the tart format introduces a baked, structured texture absent from the current menu's otherwise predominantly soft, purée-heavy progression.",
          "compliant": true,
          "score": 0.97,
          "detail": {
            "per_criterion": [
              {
                "id": "diagnosis",
                "weight": 0.4,
                "score": 1.0,
                "note": "It clearly identifies multiple concrete weaknesses: repeated pumpkin/squash, nut-allergen exposure, and the vegetarian path failing the stated constraint."
              },
              {
                "id": "remediation",
                "weight": 0.3,
                "score": 1.0,
                "note": "It proposes a specific, seasonal substitution with named ingredients and technique that directly reduces squash repetition and removes one nut-allergen point."
              },
              {
                "id": "justification",
                "weight": 0.3,
                "score": 0.9,
                "note": "The reasoning is mechanism-based around palate fatigue, structural texture balance, dietary viability, and comparative food cost, though service-flow analysis could be slightly more explicit."
              }
            ],
            "summary": "The answer follows the requested three-paragraph free-text format and avoids generic praise. It diagnoses several defensible menu-design problems and offers a concrete, seasonally plausible fix with solid operational and sensory justification.",
            "judge_model": "gpt-5.4",
            "judge_prompt_hash": "756db6a487714bca",
            "judge_validated": false
          },
          "metrics": {
            "input_tokens": 1461,
            "output_tokens": 575,
            "latency_ms": 15159.9,
            "usd": 0.013008,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.94,
      "score_summary": {
        "mean": 0.94,
        "stdev": 0.0794,
        "sem": 0.0458,
        "ci95_lo": 0.8502,
        "ci95_hi": 1.0298,
        "n": 3
      },
      "score_if_valid": 0.94,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1461.0,
        "output_tokens_mean": 622.0,
        "latency_ms_mean": 16666.933,
        "usd_sum": 0.041139
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-ar-001",
      "category": "vocabulary",
      "capability": "ingredient_classification",
      "difficulty": "easy",
      "cuisine_region": "middle_eastern",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.ingredient_categories"
      ],
      "prompt_template_hash": "94e19d81d80368a1",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Ingredient categories\n\nprotein, vegetable, fruit, grain, dairy, fat, herb, spice, fungus, legume, nut, seed, sweetener, acid, salt, other\n\n---\n\nباستخدام مفردات OGS لتصنيفات المكونات، صنّف الزعتر بكلمة واحدة.\n\nINPUT:\n{\n  \"item\": \"زعتر\"\n}\n\nANSWER FORMAT: one_of:protein,vegetable,fruit,grain,dairy,fat,herb,spice,fungus,legume,nut,seed,sweetener,acid,salt,other",
      "expected": {
        "value": "herb",
        "aliases": [
          "spice"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "herb",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "herb",
            "expected": "herb",
            "aliases": [
              "spice"
            ]
          },
          "metrics": {
            "input_tokens": 404,
            "output_tokens": 4,
            "latency_ms": 798.3,
            "usd": 0.001272,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "herb",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "herb",
            "expected": "herb",
            "aliases": [
              "spice"
            ]
          },
          "metrics": {
            "input_tokens": 404,
            "output_tokens": 4,
            "latency_ms": 774.7,
            "usd": 0.001272,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "herb",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "herb",
            "expected": "herb",
            "aliases": [
              "spice"
            ]
          },
          "metrics": {
            "input_tokens": 404,
            "output_tokens": 4,
            "latency_ms": 1419.6,
            "usd": 0.001272,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 404.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 997.533,
        "usd_sum": 0.003816
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-ar-002",
      "category": "regional",
      "capability": "region_identification",
      "difficulty": "easy",
      "cuisine_region": "middle_eastern",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "6b39ec376be689ad",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nالمنسف هو الطبق الوطني لأي بلد عربي؟ أجب بكلمة واحدة.\n\nINPUT:\n{\n  \"dish\": \"المنسف\"\n}\n\nANSWER FORMAT: exact:country",
      "expected": {
        "value": "jordan",
        "aliases": [
          "الأردن",
          "jordan (الأردن)"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "الأردن",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "الأردن",
            "expected": "jordan",
            "aliases": [
              "الأردن",
              "jordan (الأردن)"
            ]
          },
          "metrics": {
            "input_tokens": 313,
            "output_tokens": 7,
            "latency_ms": 1065.7,
            "usd": 0.001044,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "الأردن",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "الأردن",
            "expected": "jordan",
            "aliases": [
              "الأردن",
              "jordan (الأردن)"
            ]
          },
          "metrics": {
            "input_tokens": 313,
            "output_tokens": 7,
            "latency_ms": 892.8,
            "usd": 0.001044,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "الأردن",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "الأردن",
            "expected": "jordan",
            "aliases": [
              "الأردن",
              "jordan (الأردن)"
            ]
          },
          "metrics": {
            "input_tokens": 313,
            "output_tokens": 7,
            "latency_ms": 885.0,
            "usd": 0.001044,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 313.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 947.833,
        "usd_sum": 0.003132
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-es-001",
      "category": "regional",
      "capability": "region_identification",
      "difficulty": "easy",
      "cuisine_region": "spanish",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "5fe5c7f1f85e3675",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nEl pulpo a feira es un plato tradicional de qué región de España? Responde con una palabra.\n\nINPUT:\n{\n  \"dish\": \"pulpo a feira\"\n}\n\nANSWER FORMAT: exact:region name",
      "expected": {
        "value": "galicia",
        "aliases": [
          "galícia",
          "galiza"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Galicia",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "galicia",
            "expected": "galicia",
            "aliases": [
              "galícia",
              "galiza"
            ]
          },
          "metrics": {
            "input_tokens": 300,
            "output_tokens": 6,
            "latency_ms": 999.5,
            "usd": 0.00099,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Galicia",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "galicia",
            "expected": "galicia",
            "aliases": [
              "galícia",
              "galiza"
            ]
          },
          "metrics": {
            "input_tokens": 300,
            "output_tokens": 6,
            "latency_ms": 773.4,
            "usd": 0.00099,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Galicia",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "galicia",
            "expected": "galicia",
            "aliases": [
              "galícia",
              "galiza"
            ]
          },
          "metrics": {
            "input_tokens": 300,
            "output_tokens": 6,
            "latency_ms": 1736.2,
            "usd": 0.00099,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 300.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 1169.7,
        "usd_sum": 0.00297
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-es-002",
      "category": "vocabulary",
      "capability": "cooking_method_classification",
      "difficulty": "easy",
      "cuisine_region": "spanish",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.cooking_methods"
      ],
      "prompt_template_hash": "9f03b90711bec40f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n---\n\nUsando el vocabulario OGS de métodos de cocción, ¿cuál describe mejor la preparación clásica del cochinillo segoviano? Responde con una palabra.\n\nINPUT:\n{\n  \"item\": \"cochinillo segoviano\"\n}\n\nANSWER FORMAT: one_of:raw,seared,grilled,roasted,braised,poached,steamed,fried,confit,smoked,fermented,cured,pickled,dehydrated,sous_vide,blanched,sauteed,baked,charred,emulsified",
      "expected": {
        "value": "roasted"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "roasted",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "roasted",
            "expected": "roasted",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 458,
            "output_tokens": 5,
            "latency_ms": 819.3,
            "usd": 0.001449,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "roasted",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "roasted",
            "expected": "roasted",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 458,
            "output_tokens": 5,
            "latency_ms": 797.5,
            "usd": 0.001449,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "roasted",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "roasted",
            "expected": "roasted",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 458,
            "output_tokens": 5,
            "latency_ms": 787.6,
            "usd": 0.001449,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 458.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 801.467,
        "usd_sum": 0.004347
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-fr-001",
      "category": "regional",
      "capability": "region_identification",
      "difficulty": "easy",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "8d252ae3fb170e15",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nLa bouillabaisse est une préparation traditionnelle de quelle ville française ? Répondez par un mot.\n\nINPUT:\n{\n  \"dish\": \"bouillabaisse\"\n}\n\nANSWER FORMAT: exact:city name",
      "expected": {
        "value": "marseille"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Marseille",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "marseille",
            "expected": "marseille",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 304,
            "output_tokens": 6,
            "latency_ms": 725.0,
            "usd": 0.001002,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Marseille",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "marseille",
            "expected": "marseille",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 304,
            "output_tokens": 6,
            "latency_ms": 852.6,
            "usd": 0.001002,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Marseille",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "marseille",
            "expected": "marseille",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 304,
            "output_tokens": 6,
            "latency_ms": 741.5,
            "usd": 0.001002,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 304.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 773.033,
        "usd_sum": 0.003006
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-fr-002",
      "category": "sensory",
      "capability": "basic_tastes_profile",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "numeric_vector_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "96a07aeab44536ab",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nPour une sauce beurre blanc classique, donnez le profil sur l'échelle OGS des cinq saveurs [sweet, sour, salty, bitter, umami] sous la forme d'un tableau JSON de cinq nombres entre 0 et 10.\n\nINPUT:\n{\n  \"dish\": \"sauce beurre blanc classique\"\n}\n\nANSWER FORMAT: json:array of 5 numbers",
      "expected": {
        "values": [
          1,
          5,
          4,
          1,
          3
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[1.5, 6.5, 3.5, 0.5, 1.0]",
          "compliant": true,
          "score": 0.4,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 1.5,
                "expected": 1,
                "score": 0.6667
              },
              {
                "index": 1,
                "predicted": 6.5,
                "expected": 5,
                "score": 0.0
              },
              {
                "index": 2,
                "predicted": 3.5,
                "expected": 4,
                "score": 0.6667
              },
              {
                "index": 3,
                "predicted": 0.5,
                "expected": 1,
                "score": 0.6667
              },
              {
                "index": 4,
                "predicted": 1.0,
                "expected": 3,
                "score": 0.0
              }
            ],
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 431,
            "output_tokens": 28,
            "latency_ms": 2952.1,
            "usd": 0.001713,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[1.5, 6.5, 3.5, 0.5, 1.0]",
          "compliant": true,
          "score": 0.4,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 1.5,
                "expected": 1,
                "score": 0.6667
              },
              {
                "index": 1,
                "predicted": 6.5,
                "expected": 5,
                "score": 0.0
              },
              {
                "index": 2,
                "predicted": 3.5,
                "expected": 4,
                "score": 0.6667
              },
              {
                "index": 3,
                "predicted": 0.5,
                "expected": 1,
                "score": 0.6667
              },
              {
                "index": 4,
                "predicted": 1.0,
                "expected": 3,
                "score": 0.0
              }
            ],
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 431,
            "output_tokens": 28,
            "latency_ms": 1065.6,
            "usd": 0.001713,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[1.5, 6.5, 3.5, 0.5, 1.0]",
          "compliant": true,
          "score": 0.4,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 1.5,
                "expected": 1,
                "score": 0.6667
              },
              {
                "index": 1,
                "predicted": 6.5,
                "expected": 5,
                "score": 0.0
              },
              {
                "index": 2,
                "predicted": 3.5,
                "expected": 4,
                "score": 0.6667
              },
              {
                "index": 3,
                "predicted": 0.5,
                "expected": 1,
                "score": 0.6667
              },
              {
                "index": 4,
                "predicted": 1.0,
                "expected": 3,
                "score": 0.0
              }
            ],
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 431,
            "output_tokens": 28,
            "latency_ms": 1409.0,
            "usd": 0.001713,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.4,
      "score_summary": {
        "mean": 0.4,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.4,
        "ci95_hi": 0.4,
        "n": 3
      },
      "score_if_valid": 0.4,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 431.0,
        "output_tokens_mean": 28.0,
        "latency_ms_mean": 1808.9,
        "usd_sum": 0.005139
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-it-001",
      "category": "regional",
      "capability": "region_identification",
      "difficulty": "easy",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "c3aae88ebfcbe08b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nI pici sono una pasta fresca tipica di quale regione italiana? Rispondi con una parola.\n\nINPUT:\n{\n  \"dish\": \"pici cacio e pepe\"\n}\n\nANSWER FORMAT: exact:regione",
      "expected": {
        "value": "toscana"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Toscana",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "toscana",
            "expected": "toscana",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 303,
            "output_tokens": 7,
            "latency_ms": 755.5,
            "usd": 0.001014,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Toscana",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "toscana",
            "expected": "toscana",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 303,
            "output_tokens": 7,
            "latency_ms": 1045.5,
            "usd": 0.001014,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Toscana",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "toscana",
            "expected": "toscana",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 303,
            "output_tokens": 7,
            "latency_ms": 2040.6,
            "usd": 0.001014,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 303.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 1280.533,
        "usd_sum": 0.003042
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-it-002",
      "category": "pairing",
      "capability": "beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.beverage_types",
        "vocab.match_types"
      ],
      "prompt_template_hash": "30919f613b1fbcd6",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Beverage types\n\nwine, beer, spirit, cider, sake, other\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nScegli l'abbinamento più classico per l'ossobuco alla milanese fra: barolo, riesling, prosecco, lambrusco. Rispondi con una parola.\n\nINPUT:\n{\n  \"dish\": \"ossobuco alla milanese\"\n}\n\nANSWER FORMAT: exact:one of the given options",
      "expected": {
        "value": "barolo"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "barolo",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "barolo",
            "expected": "barolo",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 372,
            "output_tokens": 5,
            "latency_ms": 722.1,
            "usd": 0.001191,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "barolo",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "barolo",
            "expected": "barolo",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 372,
            "output_tokens": 5,
            "latency_ms": 723.0,
            "usd": 0.001191,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "barolo",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "barolo",
            "expected": "barolo",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 372,
            "output_tokens": 5,
            "latency_ms": 1516.8,
            "usd": 0.001191,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 372.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 987.3,
        "usd_sum": 0.003573
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-ja-001",
      "category": "vocabulary",
      "capability": "aromatic_family_classification",
      "difficulty": "easy",
      "cuisine_region": "japanese",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "ff1f43723314705c",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nOGSの香りファミリー一覧から、柚子（ゆず）の主たる香りファミリーを一語で答えてください。\n\nINPUT:\n{\n  \"item\": \"柚子\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "fruit"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "fruit",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fruit",
            "expected": "fruit",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 384,
            "output_tokens": 4,
            "latency_ms": 1626.3,
            "usd": 0.001212,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "fruit",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fruit",
            "expected": "fruit",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 384,
            "output_tokens": 4,
            "latency_ms": 1435.5,
            "usd": 0.001212,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "fruit",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fruit",
            "expected": "fruit",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 384,
            "output_tokens": 4,
            "latency_ms": 1435.6,
            "usd": 0.001212,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 384.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1499.133,
        "usd_sum": 0.003636
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:ml-ja-002",
      "category": "pairing",
      "capability": "beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.beverage_types",
        "vocab.match_types"
      ],
      "prompt_template_hash": "f68d4882814e0ecf",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Beverage types\n\nwine, beer, spirit, cider, sake, other\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\n握り寿司（赤身・白身・光物）のコース全体に最も安全に合う飲料のタイプを、次から一つ選んでください: sake, red_wine, stout_beer, bourbon.\n\nINPUT:\n{\n  \"dish\": \"握り寿司コース\"\n}\n\nANSWER FORMAT: exact:one of the given options",
      "expected": {
        "value": "sake"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "sake",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "sake",
            "expected": "sake",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 388,
            "output_tokens": 4,
            "latency_ms": 912.9,
            "usd": 0.001224,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "sake",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "sake",
            "expected": "sake",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 388,
            "output_tokens": 4,
            "latency_ms": 716.7,
            "usd": 0.001224,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "sake",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "sake",
            "expected": "sake",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 388,
            "output_tokens": 4,
            "latency_ms": 1006.8,
            "usd": 0.001224,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 388.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 878.8,
        "usd_sum": 0.003672
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-001",
      "category": "nutrition",
      "capability": "energy_calculation",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "94fadc6f5c8c5968",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. 
Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nA portion contains 40 g carbohydrate, 20 g protein, 15 g fat. Return total energy in kcal as a single integer, using the EU Annex XIV macro factors.\n\nINPUT:\n{\n  \"carb_g\": 40,\n  \"protein_g\": 20,\n  \"fat_g\": 15\n}\n\nANSWER FORMAT: numeric:integer kcal",
      "expected": {
        "value": 375,
        "tolerance": 5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Carbs: 40 × 4 = 160 kcal\nProtein: 20 × 4 = 80 kcal\nFat: 15 × 9 = 135 kcal\nTotal: 160 + 80 + 135 = 375 kcal\n\n375",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 40.0,
            "expected": 375.0,
            "tolerance": 5.0
          },
          "metrics": {
            "input_tokens": 1018,
            "output_tokens": 79,
            "latency_ms": 2021.2,
            "usd": 0.004239,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Carbs: 40 × 4 = 160 kcal\nProtein: 20 × 4 = 80 kcal\nFat: 15 × 9 = 135 kcal\nTotal: 160 + 80 + 135 = 375 kcal\n\n375",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 40.0,
            "expected": 375.0,
            "tolerance": 5.0
          },
          "metrics": {
            "input_tokens": 1018,
            "output_tokens": 79,
            "latency_ms": 1840.0,
            "usd": 0.004239,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Carbs: 40 × 4 = 160 kcal\nProtein: 20 × 4 = 80 kcal\nFat: 15 × 9 = 135 kcal\nTotal: 160 + 80 + 135 = 375 kcal\n\n375",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 40.0,
            "expected": 375.0,
            "tolerance": 5.0
          },
          "metrics": {
            "input_tokens": 1018,
            "output_tokens": 79,
            "latency_ms": 2264.8,
            "usd": 0.004239,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1018.0,
        "output_tokens_mean": 79.0,
        "latency_ms_mean": 2042.0,
        "usd_sum": 0.012717
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-002",
      "category": "nutrition",
      "capability": "energy_calculation",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "4b141daaadd9af57",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. 
Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nA 150 mL glass of red wine contains approximately 14 g alcohol, 4 g carbohydrate, no protein, no fat. Return energy in kcal as a single integer.\n\nINPUT:\n{\n  \"carb_g\": 4,\n  \"protein_g\": 0,\n  \"fat_g\": 0,\n  \"alcohol_g\": 14\n}\n\nANSWER FORMAT: numeric:integer kcal",
      "expected": {
        "value": 114,
        "tolerance": 5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "14×7 = 98 kcal (alcohol) + 4×4 = 16 kcal (carb) = 114 kcal\n\n114",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 14.0,
            "expected": 114.0,
            "tolerance": 5.0
          },
          "metrics": {
            "input_tokens": 1026,
            "output_tokens": 44,
            "latency_ms": 2195.0,
            "usd": 0.003738,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "14×7 = 98 kcal from alcohol\n4×4 = 16 kcal from carbohydrate\nTotal = 114 kcal\n\n114",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 14.0,
            "expected": 114.0,
            "tolerance": 5.0
          },
          "metrics": {
            "input_tokens": 1026,
            "output_tokens": 45,
            "latency_ms": 2313.3,
            "usd": 0.003753,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "14×7 = 98 kcal (alcohol) + 4×4 = 16 kcal (carbs) = 114 kcal\n\n114",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 14.0,
            "expected": 114.0,
            "tolerance": 5.0
          },
          "metrics": {
            "input_tokens": 1026,
            "output_tokens": 44,
            "latency_ms": 2750.7,
            "usd": 0.003738,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1026.0,
        "output_tokens_mean": 44.333,
        "latency_ms_mean": 2419.667,
        "usd_sum": 0.011229
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-003",
      "category": "nutrition",
      "capability": "per_100g_estimate",
      "difficulty": "medium",
      "cuisine_region": "mediterranean",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "518ee70ef096f9ec",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. 
Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nA simple olive-oil vinaigrette is 75 g olive oil + 25 g red wine vinegar (vinegar ≈ 20 kcal/100 g). Return energy per 100 g of finished dressing as a single integer kcal.\n\nINPUT:\n{\n  \"olive_oil_g\": 75,\n  \"vinegar_g\": 25\n}\n\nANSWER FORMAT: numeric:integer kcal per 100 g",
      "expected": {
        "value": 668,
        "tolerance": 15
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Let me calculate the energy per 100 g of finished dressing.\n\n**Olive oil (75 g):**\n75 × (884/100) = 75 × 8.84 = 663 kcal\n\n**Red wine vinegar (25 g):**\n25 × (20/100) = 25 × 0.20 = 5 kcal\n\n**Total kcal in 100 g finished dressing (75 + 25 = 100 g):**\n663 + 5 = 668 kcal\n\n668",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 100.0,
            "expected": 668.0,
            "tolerance": 15.0
          },
          "metrics": {
            "input_tokens": 1031,
            "output_tokens": 137,
            "latency_ms": 2612.3,
            "usd": 0.005148,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "I need to calculate the energy per 100g of the finished vinaigrette.\n\n**Ingredients:**\n- 75g olive oil: (75/100) × 884 = 663 kcal\n- 25g red wine vinegar: (25/100) × 20 = 5 kcal\n\n**Total mass:** 75 + 25 = 100g\n**Total kcal:** 663 + 5 = 668 kcal\n\nSince the finished mass is already 100g, the energy per 100g = **668**\n\n668",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 100.0,
            "expected": 668.0,
            "tolerance": 15.0
          },
          "metrics": {
            "input_tokens": 1031,
            "output_tokens": 136,
            "latency_ms": 2995.0,
            "usd": 0.005133,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "I need to calculate the energy per 100 g of the finished vinaigrette.\n\n**Olive oil (75 g):**\n- 884 kcal/100 g → 75 g = 663 kcal\n\n**Red wine vinegar (25 g):**\n- 20 kcal/100 g → 25 g = 5 kcal\n\n**Total energy:** 663 + 5 = 668 kcal\n\n**Total mass:** 75 + 25 = 100 g\n\n**Per 100 g:** 668 / 100 × 100 = 668 kcal\n\n668",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 100.0,
            "expected": 668.0,
            "tolerance": 15.0
          },
          "metrics": {
            "input_tokens": 1031,
            "output_tokens": 157,
            "latency_ms": 4368.1,
            "usd": 0.005448,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1031.0,
        "output_tokens_mean": 143.333,
        "latency_ms_mean": 3325.133,
        "usd_sum": 0.015729
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-004",
      "category": "nutrition",
      "capability": "protein_estimate",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "42947503c82b308d",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. 
Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nEstimate the protein (g) in a 180 g cooked portion of skinless chicken-breast filet assuming 30% cooking mass loss. Respond with one integer.\n\nINPUT:\n{\n  \"raw_mass_g_target_cooked\": 180,\n  \"cooking_loss\": 0.3\n}\n\nANSWER FORMAT: numeric:integer g",
      "expected": {
        "value": 59,
        "tolerance": 4
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "To get 180 g cooked, I need to find the raw mass:\n\nRaw mass = 180 / (1 - 0.3) = 180 / 0.7 ≈ 257.14 g\n\nProtein in raw chicken breast: 23 g per 100 g\n\nTotal protein = 257.14 × 23 / 100 ≈ 59.14 g\n\nProtein is retained (not lost with water mass), so the protein in the 180 g cooked portion is approximately 59 g.\n\n59",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 180.0,
            "expected": 59.0,
            "tolerance": 4.0
          },
          "metrics": {
            "input_tokens": 1006,
            "output_tokens": 134,
            "latency_ms": 4526.8,
            "usd": 0.005028,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "I need to find the protein in a 180 g cooked portion, where the chicken experienced 30% cooking mass loss.\n\n**Step 1: Find the raw mass that yields 180 g cooked.**\n\nIf cooked mass = raw mass × (1 - 0.30) = raw mass × 0.70\n\nRaw mass = 180 / 0.70 = 257.14 g\n\n**Step 2: Calculate total protein from raw chicken.**\n\nChicken breast raw has 23 g protein per 100 g.\n\nTotal protein = 257.14 × 23 / 100 = 59.14 g\n\n**Step 3: Protein doesn't evaporate — it stays in the cooked meat.**\n\nProtein in 180 g cooked portion = 59.14 g ≈ **59** g\n\n59",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 180.0,
            "expected": 59.0,
            "tolerance": 4.0
          },
          "metrics": {
            "input_tokens": 1006,
            "output_tokens": 199,
            "latency_ms": 5057.6,
            "usd": 0.006003,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "To get 180 g cooked, I need to find the raw mass:\n\nraw_mass = 180 / (1 - 0.3) = 180 / 0.7 ≈ 257.14 g\n\nProtein in raw chicken breast: 23 g per 100 g\n\nTotal protein = 257.14 × 23 / 100 ≈ 59.14 g\n\nProtein doesn't evaporate with water loss, so all protein remains in the cooked portion.\n\n**59**",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 180.0,
            "expected": 59.0,
            "tolerance": 4.0
          },
          "metrics": {
            "input_tokens": 1006,
            "output_tokens": 128,
            "latency_ms": 3223.5,
            "usd": 0.004938,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1006.0,
        "output_tokens_mean": 153.667,
        "latency_ms_mean": 4269.3,
        "usd_sum": 0.015969
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-005",
      "category": "nutrition",
      "capability": "macro_vector",
      "difficulty": "hard",
      "cuisine_region": "italian",
      "scorer": "numeric_vector_mae",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "375ca5d44d346632",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. 
Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nA portion contains 85 g dry pasta (75 C / 13 P / 1.5 F per 100 g), 10 g butter (0.1 C / 0.9 P / 81 F per 100 g) and 15 g parmesan (3.2 C / 35 P / 29 F per 100 g). Return [carb_g, protein_g, fat_g, kcal] for the portion as a JSON array of four numbers.\n\nINPUT:\n{\n  \"pasta_g\": 85,\n  \"butter_g\": 10,\n  \"parmesan_g\": 15\n}\n\nANSWER FORMAT: json:array [carb, protein, fat, kcal]",
      "expected": {
        "values": [
          64.3,
          16.3,
          13.7,
          446
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "I'll calculate the macros for each ingredient, then sum them up.\n\n**Dry pasta (85 g):**\n- Carbs: 85 × 0.75 = 63.75 g\n- Protein: 85 × 0.13 = 11.05 g\n- Fat: 85 × 0.015 = 1.275 g\n\n**Butter (10 g):**\n- Carbs: 10 × 0.001 = 0.01 g\n- Protein: 10 × 0.009 = 0.09 g\n- Fat: 10 × 0.81 = 8.1 g\n\n**Parmesan (15 g):**\n- Carbs: 15 × 0.032 = 0.48 g\n- Protein: 15 × 0.35 = 5.25 g\n- Fat: 15 × 0.29 = 4.35 g\n\n**Totals:**\n- Carbs: 63.75 + 0.01 + 0.48 = 64.24 g\n- Protein: 11.05 + 0.09 + 5.25 = 16.39 g\n- Fat: 1.275 + 8.1 + 4.35 = 13.725 g\n\n**kcal:**\n- Carbs: 64.24 × 4 = 256.96\n- Protein: 16.39 × 4 = 65.56\n- Fat: 13.725 × 9 = 123.525\n- Total: 256.96 + 65.56 + 123.525 = 446.045 → **446 kcal**\n\n[64.24, 16.39, 13.725, 446]",
          "compliant": true,
          "score": 0.9854,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 64.24,
                "expected": 64.3,
                "score": 0.98
              },
              {
                "index": 1,
                "predicted": 16.39,
                "expected": 16.3,
                "score": 0.97
              },
              {
                "index": 2,
                "predicted": 13.725,
                "expected": 13.7,
                "score": 0.9917
              },
              {
                "index": 3,
                "predicted": 446,
                "expected": 446,
                "score": 1.0
              }
            ],
            "tolerance": 3.0
          },
          "metrics": {
            "input_tokens": 1102,
            "output_tokens": 450,
            "latency_ms": 5213.2,
            "usd": 0.010056,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "I'll calculate the macros for each ingredient scaled by mass, then sum them.\n\n**Pasta (85 g):**\n- Carbs: 85 × 0.75 = 63.75 g\n- Protein: 85 × 0.13 = 11.05 g\n- Fat: 85 × 0.015 = 1.275 g\n\n**Butter (10 g):**\n- Carbs: 10 × 0.001 = 0.01 g\n- Protein: 10 × 0.009 = 0.09 g\n- Fat: 10 × 0.81 = 8.1 g\n\n**Parmesan (15 g):**\n- Carbs: 15 × 0.032 = 0.48 g\n- Protein: 15 × 0.35 = 5.25 g\n- Fat: 15 × 0.29 = 4.35 g\n\n**Totals:**\n- Carbs: 63.75 + 0.01 + 0.48 = 64.24 g\n- Protein: 11.05 + 0.09 + 5.25 = 16.39 g\n- Fat: 1.275 + 8.1 + 4.35 = 13.725 g\n\n**kcal:**\n- Carbs: 64.24 × 4 = 256.96\n- Protein: 16.39 × 4 = 65.56\n- Fat: 13.725 × 9 = 123.525\n- Total: 256.96 + 65.56 + 123.525 = 446.045 → **446 kcal**\n\n[64.24, 16.39, 13.725, 446]",
          "compliant": true,
          "score": 0.9854,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 64.24,
                "expected": 64.3,
                "score": 0.98
              },
              {
                "index": 1,
                "predicted": 16.39,
                "expected": 16.3,
                "score": 0.97
              },
              {
                "index": 2,
                "predicted": 13.725,
                "expected": 13.7,
                "score": 0.9917
              },
              {
                "index": 3,
                "predicted": 446,
                "expected": 446,
                "score": 1.0
              }
            ],
            "tolerance": 3.0
          },
          "metrics": {
            "input_tokens": 1102,
            "output_tokens": 451,
            "latency_ms": 5514.0,
            "usd": 0.010071,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "I'll calculate macros for each ingredient at their given amounts:\n\n**Pasta (85 g):**\n- Carb: 85 × 0.75 = 63.75 g\n- Protein: 85 × 0.13 = 11.05 g\n- Fat: 85 × 0.015 = 1.275 g\n\n**Butter (10 g):**\n- Carb: 10 × 0.001 = 0.01 g\n- Protein: 10 × 0.009 = 0.09 g\n- Fat: 10 × 0.81 = 8.1 g\n\n**Parmesan (15 g):**\n- Carb: 15 × 0.032 = 0.48 g\n- Protein: 15 × 0.35 = 5.25 g\n- Fat: 15 × 0.29 = 4.35 g\n\n**Totals:**\n- Carb: 63.75 + 0.01 + 0.48 = 64.24 g\n- Protein: 11.05 + 0.09 + 5.25 = 16.39 g\n- Fat: 1.275 + 8.1 + 4.35 = 13.725 g\n\n**kcal:**\n- Carb: 64.24 × 4 = 256.96\n- Protein: 16.39 × 4 = 65.56\n- Fat: 13.725 × 9 = 123.525\n- Total: 256.96 + 65.56 + 123.525 = 446.045 → **446 kcal**\n\n[64.24, 16.39, 13.725, 446]",
          "compliant": true,
          "score": 0.9854,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 64.24,
                "expected": 64.3,
                "score": 0.98
              },
              {
                "index": 1,
                "predicted": 16.39,
                "expected": 16.3,
                "score": 0.97
              },
              {
                "index": 2,
                "predicted": 13.725,
                "expected": 13.7,
                "score": 0.9917
              },
              {
                "index": 3,
                "predicted": 446,
                "expected": 446,
                "score": 1.0
              }
            ],
            "tolerance": 3.0
          },
          "metrics": {
            "input_tokens": 1102,
            "output_tokens": 447,
            "latency_ms": 5468.5,
            "usd": 0.010011,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.9854,
      "score_summary": {
        "mean": 0.9854,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.9854,
        "ci95_hi": 0.9854,
        "n": 3
      },
      "score_if_valid": 0.9854,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1102.0,
        "output_tokens_mean": 449.333,
        "latency_ms_mean": 5398.567,
        "usd_sum": 0.030138
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-006",
      "category": "nutrition",
      "capability": "energy_calculation",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "1ba6175ed99ad486",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. 
Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nA mocktail uses 180 mL fresh orange juice (approx 9 g carbohydrate per 100 g) plus 12 g granulated sugar. Return kcal as a single integer.\n\nINPUT:\n{\n  \"oj_ml\": 180,\n  \"sugar_g\": 12\n}\n\nANSWER FORMAT: numeric:integer kcal",
      "expected": {
        "value": 111,
        "tolerance": 6
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "I need to calculate the total kcal for the mocktail, then express it per 100g of finished component... but the task says \"Return kcal as a single integer\" without specifying per 100g. Let me re-read.\n\n\"Return kcal\" - this seems to be total kcal for the drink.\n\n**Orange juice:**\n- 180 mL ≈ 180 g (assuming density ≈ 1 g/mL)\n- Carbohydrate: 9 g per 100 g → 180 × 9/100 = 16.2 g carbs\n- Protein and fat are minimal (OJ not listed in common ingredients, so use provided: 9g carb/100g, assume ~0.7g protein, ~0.2g fat per 100g for OJ, but task only gives carbohydrate)\n- Since only carbohydrate is specified (9g/100g), I'll use just that for OJ.\n- OJ kcal: 16.2 g × 4 kcal/g = 64.8 kcal\n\n**Sugar:**\n- 12 g granulated sugar\n- 100% carbohydrate: 12 × 4 = 48 kcal\n\n**Total kcal:**\n64.8 + 48 = 112.8 ≈ **113**\n\n113",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 100.0,
            "expected": 111.0,
            "tolerance": 6.0
          },
          "metrics": {
            "input_tokens": 1010,
            "output_tokens": 335,
            "latency_ms": 6928.0,
            "usd": 0.008055,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "I need to calculate the total kcal for the mocktail.\n\n**Assumptions:**\n- Fresh orange juice density ≈ 1.04 g/mL (commonly used), but for simplicity I'll use 1 g/mL as a standard approximation unless stated otherwise. Actually, orange juice is approximately 1.04 g/mL, but the problem doesn't specify density explicitly. I'll use 1.04 g/mL.\n\nWait, let me re-read: \"180 mL fresh orange juice (approx 9 g carbohydrate per 100 g)\". I'll use density of ~1.04 g/mL for OJ.\n\nActually, for simplicity in OGS calculations without explicit density, let me use 1.04 g/mL as standard for juice.\n\n**Orange juice:**\n- Mass = 180 mL × 1.04 g/mL = 187.2 g\n- Carbohydrates: 9 g per 100 g → 187.2 × 0.09 = 16.848 g carbs\n- Protein and fat: negligible (OJ is primarily carbs/water)\n- kcal from OJ = 16.848 × 4 = 67.39 kcal\n\nHmm, but actually without explicit density, perhaps simpler to use 1 g/mL:\n\n**With 1 g/mL:**\n- Mass = 180 g\n- Carbs = 180 × 0.09 = 16.2 g\n- kcal = 16.2 × 4 = 64.8 kcal\n\n**Sugar:**\n- 12 g × (100 C / 100) = 12 g carbs\n- kcal = 12 × 4 = 48 kcal\n\n**Total (1 g/mL):** 64.8 + 48 = 112.8 → **113 kcal**\n\n**Total (1.04 g/mL):** 67.39 + 48 = 115.39 → **115 kcal**\n\nGiven OGS doesn't specify OJ density and the problem uses \"mL\" without a conversion factor, I'll use the simpler 1 g/mL assumption.\n\n**113**",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 1.04,
            "expected": 111.0,
            "tolerance": 6.0
          },
          "metrics": {
            "input_tokens": 1010,
            "output_tokens": 557,
            "latency_ms": 9104.8,
            "usd": 0.011385,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "I need to calculate the total kcal for the mocktail.\n\n**Assumptions:**\n- Orange juice density ≈ 1.04 g/mL (but for simplicity, I'll use 1 g/mL as a common approximation... actually let me use the standard ~1.04 g/mL for juice)\n\nWait, the problem doesn't specify density. I'll use 1.05 g/mL for orange juice (a common approximation), but actually let me reconsider — many OGS problems use 1 g/mL for simplicity unless stated. I'll use 1.05 g/mL since OJ is slightly denser than water.\n\nActually, to keep it straightforward and since no density is given explicitly, I'll use **1 g/mL** (water approximation).\n\n**Orange juice:**\n- Mass = 180 g\n- Carbs = 9 g per 100 g → 180 × 0.09 = 16.2 g carbs\n- Protein and fat negligible (OJ is essentially carbs + water)\n- kcal from OJ = 16.2 × 4 = 64.8 kcal\n\n**Sugar:**\n- 12 g × (387 kcal/100 g) = 12 × 3.87 = 46.44 kcal\n- Or using 4 kcal/g for carbohydrate: 12 × 4 = 48 kcal\n- Sugar is 100 g carb per 100 g, so energy = 12 × 4 = 48 kcal\n\n**Total kcal:**\n- 64.8 + 48 = 112.8 ≈ **113 kcal**\n\n113",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 1.04,
            "expected": 111.0,
            "tolerance": 6.0
          },
          "metrics": {
            "input_tokens": 1010,
            "output_tokens": 428,
            "latency_ms": 8252.9,
            "usd": 0.00945,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1010.0,
        "output_tokens_mean": 440.0,
        "latency_ms_mean": 8095.233,
        "usd_sum": 0.02889
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-007",
      "category": "nutrition",
      "capability": "fibre_rule",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "b9e5625ed0828c94",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. 
Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nWhat kcal/g factor does the OGS nutrition reference assign to dietary fibre? Respond with a single number.\n\nANSWER FORMAT: numeric:decimal kcal/g",
      "expected": {
        "value": 2,
        "tolerance": 0.1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "2",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 2.0,
            "expected": 2.0,
            "tolerance": 0.1
          },
          "metrics": {
            "input_tokens": 964,
            "output_tokens": 5,
            "latency_ms": 936.4,
            "usd": 0.002967,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "2",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 2.0,
            "expected": 2.0,
            "tolerance": 0.1
          },
          "metrics": {
            "input_tokens": 964,
            "output_tokens": 5,
            "latency_ms": 1084.4,
            "usd": 0.002967,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "2",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 2.0,
            "expected": 2.0,
            "tolerance": 0.1
          },
          "metrics": {
            "input_tokens": 964,
            "output_tokens": 5,
            "latency_ms": 1204.3,
            "usd": 0.002967,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 964.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1075.033,
        "usd_sum": 0.008901
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-008",
      "category": "nutrition",
      "capability": "kj_conversion",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "95ef839d58e96bbc",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. 
Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nConvert 250 kcal to kJ using the canonical OGS/EU conversion factor. Respond with a single integer.\n\nINPUT:\n{\n  \"kcal\": 250\n}\n\nANSWER FORMAT: numeric:integer kJ",
      "expected": {
        "value": 1046,
        "tolerance": 10
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "1046",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 1046.0,
            "expected": 1046.0,
            "tolerance": 10.0
          },
          "metrics": {
            "input_tokens": 978,
            "output_tokens": 6,
            "latency_ms": 956.4,
            "usd": 0.003024,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "1046",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 1046.0,
            "expected": 1046.0,
            "tolerance": 10.0
          },
          "metrics": {
            "input_tokens": 978,
            "output_tokens": 6,
            "latency_ms": 983.9,
            "usd": 0.003024,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "1046",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 1046.0,
            "expected": 1046.0,
            "tolerance": 10.0
          },
          "metrics": {
            "input_tokens": 978,
            "output_tokens": 6,
            "latency_ms": 885.7,
            "usd": 0.003024,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 978.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 942.0,
        "usd_sum": 0.009072
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-009",
      "category": "nutrition",
      "capability": "macro_vector",
      "difficulty": "hard",
      "cuisine_region": "italian",
      "scorer": "numeric_vector_mae",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "641416c8b33c6dbc",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. 
Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nA simple Alfredo sauce is 100 g butter + 150 g heavy cream (36% fat: 2.8 C / 2 P / 37 F) + 60 g parmesan (3.2 C / 35 P / 29 F), reduced to 250 g finished weight. Return [carb_per100g, protein_per100g, fat_per100g, kcal_per100g] as a JSON array of four numbers.\n\nINPUT:\n{\n  \"butter_g\": 100,\n  \"cream_g\": 150,\n  \"parmesan_g\": 60,\n  \"finished_g\": 250\n}\n\nANSWER FORMAT: json:array [carb, protein, fat, kcal] per 100 g",
      "expected": {
        "values": [
          2.5,
          9.6,
          55.2,
          545
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "I'll calculate the total macros for all ingredients, then scale to per 100g of finished sauce.\n\n**Step 1: Total macros for each ingredient**\n\nButter (100g): 0.1 C, 0.9 P, 81 F\n- C: 0.1g, P: 0.9g, F: 81g\n\nHeavy cream (150g): 2.8 C / 2 P / 37 F per 100g\n- C: 4.2g, P: 3.0g, F: 55.5g\n\nParmesan (60g): 3.2 C / 35 P / 29 F per 100g\n- C: 1.92g, P: 21.0g, F: 17.4g\n\n**Step 2: Sum totals**\n- Total C: 0.1 + 4.2 + 1.92 = 6.22g\n- Total P: 0.9 + 3.0 + 21.0 = 24.9g\n- Total F: 81 + 55.5 + 17.4 = 153.9g\n\n**Step 3: Per 100g of finished sauce (250g)**\n\n- C: 6.22 / 250 × 100 = 2.488g\n- P: 24.9 / 250 × 100 = 9.96g\n- F: 153.9 / 250 × 100 = 61.56g\n\n**Step 4: kcal per 100g**\n- C: 2.488 × 4 = 9.952\n- P: 9.96 × 4 = 39.84\n- F: 61.56 × 9 = 554.04\n- Total: 9.952 + 39.84 + 554.04 = 603.832 → **604 kcal**\n\n[2.49, 9.96, 61.56, 604]",
          "compliant": true,
          "score": 0.4769,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 2.49,
                "expected": 2.5,
                "score": 0.9975
              },
              {
                "index": 1,
                "predicted": 9.96,
                "expected": 9.6,
                "score": 0.91
              },
              {
                "index": 2,
                "predicted": 61.56,
                "expected": 55.2,
                "score": 0.0
              },
              {
                "index": 3,
                "predicted": 604,
                "expected": 545,
                "score": 0.0
              }
            ],
            "tolerance": 4.0
          },
          "metrics": {
            "input_tokens": 1109,
            "output_tokens": 492,
            "latency_ms": 6410.7,
            "usd": 0.010707,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "I need to calculate the total macros for each ingredient, sum them, then divide by finished weight to get per-100g values.\n\n**Butter (100g):** 0.1C / 0.9P / 81F\n- Carbs: 0.1g\n- Protein: 0.9g\n- Fat: 81g\n\n**Heavy Cream (150g):** 2.8C / 2P / 37F per 100g\n- Carbs: 2.8 × 1.5 = 4.2g\n- Protein: 2.0 × 1.5 = 3.0g\n- Fat: 37 × 1.5 = 55.5g\n\n**Parmesan (60g):** 3.2C / 35P / 29F per 100g\n- Carbs: 3.2 × 0.6 = 1.92g\n- Protein: 35 × 0.6 = 21.0g\n- Fat: 29 × 0.6 = 17.4g\n\n**Totals (raw):**\n- Carbs: 0.1 + 4.2 + 1.92 = 6.22g\n- Protein: 0.9 + 3.0 + 21.0 = 24.9g\n- Fat: 81 + 55.5 + 17.4 = 153.9g\n\n**Per 100g of finished sauce (250g finished):**\n- Carbs: 6.22 / 250 × 100 = 2.488g\n- Protein: 24.9 / 250 × 100 = 9.96g\n- Fat: 153.9 / 250 × 100 = 61.56g\n\n**kcal per 100g:**\n- Carbs: 2.488 × 4 = 9.952\n- Protein: 9.96 × 4 = 39.84\n- Fat: 61.56 × 9 = 554.04\n- Total: 9.952 + 39.84 + 554.04 = 603.832 → **604 kcal**\n\n[2.49, 9.96, 61.56, 604]",
          "compliant": true,
          "score": 0.4769,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 2.49,
                "expected": 2.5,
                "score": 0.9975
              },
              {
                "index": 1,
                "predicted": 9.96,
                "expected": 9.6,
                "score": 0.91
              },
              {
                "index": 2,
                "predicted": 61.56,
                "expected": 55.2,
                "score": 0.0
              },
              {
                "index": 3,
                "predicted": 604,
                "expected": 545,
                "score": 0.0
              }
            ],
            "tolerance": 4.0
          },
          "metrics": {
            "input_tokens": 1109,
            "output_tokens": 563,
            "latency_ms": 7545.1,
            "usd": 0.011772,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "I'll calculate the total macros for each ingredient, then scale to per-100g of finished sauce.\n\n**Step 1: Total macros per ingredient (in grams of macro)**\n\nButter (100 g): 0.1 C, 0.9 P, 81 F\nHeavy cream (150 g): 2.8×1.5=4.2 C, 2×1.5=3.0 P, 37×1.5=55.5 F\nParmesan (60 g): 3.2×0.6=1.92 C, 35×0.6=21.0 P, 29×0.6=17.4 F\n\n**Step 2: Totals across all ingredients**\n\n- Carbs: 0.1 + 4.2 + 1.92 = 6.22 g\n- Protein: 0.9 + 3.0 + 21.0 = 24.9 g\n- Fat: 81 + 55.5 + 17.4 = 153.9 g\n\n**Step 3: Per 100 g of finished sauce (finished = 250 g)**\n\n- Carbs: 6.22 / 250 × 100 = 2.488 g\n- Protein: 24.9 / 250 × 100 = 9.96 g\n- Fat: 153.9 / 250 × 100 = 61.56 g\n\n**Step 4: kcal per 100 g**\n\n(2.488 × 4) + (9.96 × 4) + (61.56 × 9)\n= 9.952 + 39.84 + 554.04\n= 603.832 → **604 kcal**\n\n[2.49, 9.96, 61.56, 604]",
          "compliant": true,
          "score": 0.4769,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 2.49,
                "expected": 2.5,
                "score": 0.9975
              },
              {
                "index": 1,
                "predicted": 9.96,
                "expected": 9.6,
                "score": 0.91
              },
              {
                "index": 2,
                "predicted": 61.56,
                "expected": 55.2,
                "score": 0.0
              },
              {
                "index": 3,
                "predicted": 604,
                "expected": 545,
                "score": 0.0
              }
            ],
            "tolerance": 4.0
          },
          "metrics": {
            "input_tokens": 1109,
            "output_tokens": 459,
            "latency_ms": 6187.8,
            "usd": 0.010212,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.4769,
      "score_summary": {
        "mean": 0.4769,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.4769,
        "ci95_hi": 0.4769,
        "n": 3
      },
      "score_if_valid": 0.4769,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1109.0,
        "output_tokens_mean": 504.667,
        "latency_ms_mean": 6714.533,
        "usd_sum": 0.032691
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:nut-010",
      "category": "nutrition",
      "capability": "numeric_compare",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.nutrition"
      ],
      "prompt_template_hash": "fd7edc20921469ff",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS nutrition reference (per-100 g macro densities)\n\nMacronutrient energy values (EU Regulation 1169/2011 Annex XIV):\n- carbohydrate: **4 kcal/g** (17 kJ/g)\n- protein: **4 kcal/g** (17 kJ/g)\n- fat: **9 kcal/g** (37 kJ/g)\n- alcohol: **7 kcal/g** (29 kJ/g)\n- fibre: **2 kcal/g** (8 kJ/g)\n\nExpress nutrition values per 100 g of the finished component unless explicitly stated otherwise. 
Round kcal to the nearest integer.\n\nCommon ingredient macros (per 100 g, approximate, for reasoning only):\n- olive oil: 0 C / 0 P / 100 F → 884 kcal\n- butter: 0.1 C / 0.9 P / 81 F → 717 kcal\n- chicken breast (raw, skinless): 0 C / 23 P / 1.2 F → 110 kcal\n- salmon (raw, Atlantic): 0 C / 20 P / 13 F → 208 kcal\n- dry pasta: 75 C / 13 P / 1.5 F → 371 kcal\n- cooked white rice: 28 C / 2.7 P / 0.3 F → 130 kcal\n- granulated sugar: 100 C / 0 P / 0 F → 387 kcal\n- all-purpose flour: 76 C / 10 P / 1 F → 364 kcal\n- whole milk: 4.8 C / 3.4 P / 3.3 F → 61 kcal\n- heavy cream (36%): 2.8 C / 2 P / 37 F → 345 kcal\n- eggs (whole, raw): 1.1 C / 12.6 P / 9.5 F → 143 kcal\n- parmesan: 3.2 C / 35 P / 29 F → 420 kcal\n- tomato (raw): 3.9 C / 0.9 P / 0.2 F → 18 kcal\n- onion (raw): 9 C / 1.1 P / 0.1 F → 40 kcal\n\nWhen summing nutrition across a recipe, scale each ingredient by its mass in grams, then divide by total finished mass to get per-100 g values. Assume no mass loss from evaporation unless the task says otherwise.\n\n---\n\nFrom the set {olive oil, butter, granulated sugar, dry pasta}, return the ingredient with the highest kcal per gram. Respond with one short name.\n\nINPUT:\n{\n  \"set\": [\n    \"olive oil\",\n    \"butter\",\n    \"granulated sugar\",\n    \"dry pasta\"\n  ]\n}\n\nANSWER FORMAT: exact:string",
      "expected": {
        "value": "olive oil",
        "aliases": [
          "olive-oil"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "olive oil",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "olive oil",
            "expected": "olive oil",
            "aliases": [
              "olive-oil"
            ]
          },
          "metrics": {
            "input_tokens": 1007,
            "output_tokens": 5,
            "latency_ms": 2057.2,
            "usd": 0.003096,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "olive oil",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "olive oil",
            "expected": "olive oil",
            "aliases": [
              "olive-oil"
            ]
          },
          "metrics": {
            "input_tokens": 1007,
            "output_tokens": 5,
            "latency_ms": 1371.3,
            "usd": 0.003096,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "olive oil",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "olive oil",
            "expected": "olive oil",
            "aliases": [
              "olive-oil"
            ]
          },
          "metrics": {
            "input_tokens": 1007,
            "output_tokens": 5,
            "latency_ms": 1215.5,
            "usd": 0.003096,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1007.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1548.0,
        "usd_sum": 0.009288
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-001",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "easy",
      "cuisine_region": "american",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "3fa6a67d5bfe7f1a",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage option for the dish below from the candidates. Respond with just the candidate id (e.g. 'a').\n\nINPUT:\n{\n  \"dish\": \"60-day dry-aged ribeye, medium-rare, served with peppercorn jus\",\n  \"candidates\": {\n    \"a\": \"young Barolo DOCG (high tannin, high acidity, full body)\",\n    \"b\": \"off-dry German Kabinett Riesling\",\n    \"c\": \"unoaked Muscadet sur lie\",\n    \"d\": \"sweet Pedro Ximenez sherry\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "b",
          "d",
          "c"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 441,
            "output_tokens": 4,
            "latency_ms": 1485.1,
            "usd": 0.001383,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 441,
            "output_tokens": 4,
            "latency_ms": 1034.6,
            "usd": 0.001383,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 441,
            "output_tokens": 4,
            "latency_ms": 1120.8,
            "usd": 0.001383,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 441.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1213.5,
        "usd_sum": 0.004149
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-002",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "easy",
      "cuisine_region": "french",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "4b85899cd6206acb",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for the dish below from the candidates. Respond with just the candidate id.\n\nINPUT:\n{\n  \"dish\": \"fresh Kumamoto oysters on the half shell, lemon and mignonette\",\n  \"candidates\": {\n    \"a\": \"oaked California Chardonnay (14.5% ABV, buttery)\",\n    \"b\": \"Muscadet Sevre-et-Maine sur lie\",\n    \"c\": \"Amarone della Valpolicella\",\n    \"d\": \"bourbon old fashioned\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "b",
        "ranking": [
          "b",
          "a",
          "d",
          "c"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "b",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "b",
            "expected_best": "b",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 424,
            "output_tokens": 4,
            "latency_ms": 688.5,
            "usd": 0.001332,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "b",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "b",
            "expected_best": "b",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 424,
            "output_tokens": 4,
            "latency_ms": 833.0,
            "usd": 0.001332,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "b",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "b",
            "expected_best": "b",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 424,
            "output_tokens": 4,
            "latency_ms": 859.0,
            "usd": 0.001332,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 424.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 793.5,
        "usd_sum": 0.003996
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-003",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "mexican",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "41ed3ee840823502",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for the dish below. Respond with just the candidate id.\n\nINPUT:\n{\n  \"dish\": \"classic Chengdu-style mapo tofu (heavy doubanjiang, Sichuan peppercorn, ground pork, high chilli heat)\",\n  \"candidates\": {\n    \"a\": \"15% ABV Zinfandel\",\n    \"b\": \"off-dry Riesling Kabinett (~10% ABV)\",\n    \"c\": \"Islay single malt Scotch (Lagavulin 16)\",\n    \"d\": \"Brunello di Montalcino Riserva\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "b",
        "ranking": [
          "b",
          "d",
          "a",
          "c"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "b",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "b",
            "expected_best": "b",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 449,
            "output_tokens": 4,
            "latency_ms": 1210.3,
            "usd": 0.001407,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "b",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "b",
            "expected_best": "b",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 449,
            "output_tokens": 4,
            "latency_ms": 722.7,
            "usd": 0.001407,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "b",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "b",
            "expected_best": "b",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 449,
            "output_tokens": 4,
            "latency_ms": 807.4,
            "usd": 0.001407,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 449.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 913.467,
        "usd_sum": 0.004221
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-004",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "82a4b41d0e3d5668",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for the dish below. Respond with just the candidate id.\n\nINPUT:\n{\n  \"dish\": \"warm goat cheese crouton on mesclun with walnut vinaigrette\",\n  \"candidates\": {\n    \"a\": \"Sancerre (Sauvignon Blanc, Loire)\",\n    \"b\": \"Napa Cabernet Sauvignon Reserve\",\n    \"c\": \"Vintage Port\",\n    \"d\": \"lagered pilsner\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "d",
          "b",
          "c"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 410,
            "output_tokens": 4,
            "latency_ms": 918.8,
            "usd": 0.00129,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 410,
            "output_tokens": 4,
            "latency_ms": 1022.4,
            "usd": 0.00129,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 410,
            "output_tokens": 4,
            "latency_ms": 1036.7,
            "usd": 0.00129,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 410.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 992.633,
        "usd_sum": 0.00387
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-005",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "441705237fe65e54",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for the dish below. Respond with just the candidate id.\n\nINPUT:\n{\n  \"dish\": \"warm dark chocolate fondant (70% cacao), molten centre, vanilla ice cream\",\n  \"candidates\": {\n    \"a\": \"dry Sancerre Sauvignon Blanc\",\n    \"b\": \"vintage Port\",\n    \"c\": \"bone-dry Brut Nature Champagne\",\n    \"d\": \"Muscadet Sevre-et-Maine\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "b",
        "ranking": [
          "b",
          "c",
          "a",
          "d"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "b",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "b",
            "expected_best": "b",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 412,
            "output_tokens": 4,
            "latency_ms": 968.3,
            "usd": 0.001296,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "b",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "b",
            "expected_best": "b",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 412,
            "output_tokens": 4,
            "latency_ms": 1012.4,
            "usd": 0.001296,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "b",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "b",
            "expected_best": "b",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 412,
            "output_tokens": 4,
            "latency_ms": 1480.8,
            "usd": 0.001296,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 412.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1153.833,
        "usd_sum": 0.003888
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-006",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "b195ee1dfe7968ab",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage to pair across a premium edomae sushi omakase. Respond with just the candidate id.\n\nINPUT:\n{\n  \"dish\": \"premium edomae sushi omakase (akami, toro, kohada, ikura, uni, anago)\",\n  \"candidates\": {\n    \"a\": \"Junmai Daiginjo sake (well-chilled)\",\n    \"b\": \"heavily oaked California Chardonnay\",\n    \"c\": \"Barolo Riserva\",\n    \"d\": \"Islay single malt\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "b",
          "d",
          "c"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 431,
            "output_tokens": 4,
            "latency_ms": 905.2,
            "usd": 0.001353,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 431,
            "output_tokens": 4,
            "latency_ms": 1237.2,
            "usd": 0.001353,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 431,
            "output_tokens": 4,
            "latency_ms": 1191.0,
            "usd": 0.001353,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 431.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1111.133,
        "usd_sum": 0.004059
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-007",
      "category": "pairing",
      "capability": "regional_pairing",
      "difficulty": "hard",
      "cuisine_region": "spanish",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "81bf14f26f4fa761",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for a classic paella valenciana (rabbit, chicken, garrofó beans, saffron). Respond with just the candidate id.\n\nINPUT:\n{\n  \"dish\": \"authentic paella valenciana\",\n  \"candidates\": {\n    \"a\": \"young Rioja crianza (Tempranillo blend, light oak)\",\n    \"b\": \"Amarone della Valpolicella Riserva\",\n    \"c\": \"Islay peated single malt\",\n    \"d\": \"off-dry Moscato d'Asti\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "b",
          "d",
          "c"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 428,
            "output_tokens": 4,
            "latency_ms": 693.6,
            "usd": 0.001344,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 428,
            "output_tokens": 4,
            "latency_ms": 1009.4,
            "usd": 0.001344,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 428,
            "output_tokens": 4,
            "latency_ms": 719.7,
            "usd": 0.001344,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 428.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 807.567,
        "usd_sum": 0.004032
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-008",
      "category": "pairing",
      "capability": "pairing_grade_reasoning",
      "difficulty": "hard",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.grade_scale"
      ],
      "prompt_template_hash": "51ad14586a063779",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Grade scale (derived from 0-100 overall_score)\n\n- **A+**: 95-100\n- **A**: 90-94\n- **B+**: 85-89\n- **B**: 80-84\n- **C+**: 75-79\n- **C**: 70-74\n- **D**: 60-69\n- **F**: <60\n\n---\n\nA pairing's overall_score is computed as 68. Using the OGS grade scale, return the letter grade (ogs-match.md §5). Respond with just the grade token.\n\nINPUT:\n{\n  \"overall_score\": 68\n}\n\nANSWER FORMAT: one_of:A+,A,B+,B,C+,C,D,F",
      "expected": {
        "value": "D"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "D",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "D",
            "expected": "D",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 429,
            "output_tokens": 4,
            "latency_ms": 1292.9,
            "usd": 0.001347,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "D",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "D",
            "expected": "D",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 429,
            "output_tokens": 4,
            "latency_ms": 1132.3,
            "usd": 0.001347,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "D",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "D",
            "expected": "D",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 429,
            "output_tokens": 4,
            "latency_ms": 2045.9,
            "usd": 0.001347,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 429.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1490.367,
        "usd_sum": 0.004041
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-009",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "easy",
      "cuisine_region": "italian",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "d7ade64ce48bfbe4",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for the dish below from the candidates. Respond with just the candidate id (e.g. 'a').\n\nINPUT:\n{\n  \"dish\": \"classic Neapolitan pizza Margherita (tomato, buffalo mozzarella, basil)\",\n  \"candidates\": {\n    \"a\": \"bright Chianti Classico (Sangiovese, high acid, medium tannin)\",\n    \"b\": \"heavily oaked California Chardonnay\",\n    \"c\": \"sweet ice wine\",\n    \"d\": \"Islay peated whisky\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "b",
          "d",
          "c"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 433,
            "output_tokens": 4,
            "latency_ms": 3067.5,
            "usd": 0.001359,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 433,
            "output_tokens": 4,
            "latency_ms": 1183.0,
            "usd": 0.001359,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 433,
            "output_tokens": 4,
            "latency_ms": 1436.1,
            "usd": 0.001359,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 433.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1895.533,
        "usd_sum": 0.004077
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-010",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "chinese",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "637717fdf3577204",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for the dish below.\n\nINPUT:\n{\n  \"dish\": \"mapo tofu with doubanjiang and Sichuan peppercorn, medium-hot\",\n  \"candidates\": {\n    \"a\": \"off-dry German Riesling Spätlese\",\n    \"b\": \"high-alcohol (15%) Zinfandel\",\n    \"c\": \"bone-dry Fino sherry\",\n    \"d\": \"straight rye whiskey\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "c",
          "d",
          "b"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 409,
            "output_tokens": 4,
            "latency_ms": 914.8,
            "usd": 0.001287,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 409,
            "output_tokens": 4,
            "latency_ms": 1044.7,
            "usd": 0.001287,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 409,
            "output_tokens": 4,
            "latency_ms": 4074.2,
            "usd": 0.001287,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 409.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 2011.233,
        "usd_sum": 0.003861
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-011",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "indian",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "2b655f8b1ea64df8",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for the dish below.\n\nINPUT:\n{\n  \"dish\": \"butter chicken (murgh makhani) in rich tomato-cream sauce with fenugreek\",\n  \"candidates\": {\n    \"a\": \"off-dry Gewurztraminer\",\n    \"b\": \"young tannic Cabernet Sauvignon\",\n    \"c\": \"unoaked Muscadet\",\n    \"d\": \"Islay single malt\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "c",
          "b",
          "d"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 405,
            "output_tokens": 4,
            "latency_ms": 1913.2,
            "usd": 0.001275,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 405,
            "output_tokens": 4,
            "latency_ms": 1129.3,
            "usd": 0.001275,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 405,
            "output_tokens": 4,
            "latency_ms": 924.4,
            "usd": 0.001275,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 405.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1322.3,
        "usd_sum": 0.003825
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-012",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "ethiopian",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "3fe344c62017694c",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for the dish.\n\nINPUT:\n{\n  \"dish\": \"doro wat: chicken stewed in berbere-spiced ghee, served with injera\",\n  \"candidates\": {\n    \"a\": \"juicy, low-tannin Beaujolais (Gamay) served slightly chilled\",\n    \"b\": \"high-alcohol Amarone\",\n    \"c\": \"bone-dry sparkling Prosecco Brut\",\n    \"d\": \"heavily peated Islay whisky\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "c",
          "b",
          "d"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 419,
            "output_tokens": 4,
            "latency_ms": 773.5,
            "usd": 0.001317,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 419,
            "output_tokens": 4,
            "latency_ms": 1069.8,
            "usd": 0.001317,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 419,
            "output_tokens": 4,
            "latency_ms": 929.4,
            "usd": 0.001317,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 419.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 924.233,
        "usd_sum": 0.003951
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-013",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "argentinian",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "4d30c6e439c9c85f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage.\n\nINPUT:\n{\n  \"dish\": \"grilled ribeye over wood embers, served with chimichurri\",\n  \"candidates\": {\n    \"a\": \"old-vine Mendoza Malbec, medium-plus tannin, black fruit\",\n    \"b\": \"unoaked Chablis\",\n    \"c\": \"off-dry Moscato\",\n    \"d\": \"dry Manzanilla sherry\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "d",
          "b",
          "c"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 404,
            "output_tokens": 4,
            "latency_ms": 1272.5,
            "usd": 0.001272,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 404,
            "output_tokens": 4,
            "latency_ms": 2162.7,
            "usd": 0.001272,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 404,
            "output_tokens": 4,
            "latency_ms": 1385.8,
            "usd": 0.001272,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 404.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1607.0,
        "usd_sum": 0.003816
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-014",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "hard",
      "cuisine_region": "japanese",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "848f37f91777acfb",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for otoro nigiri topped with a brush of soy.\n\nINPUT:\n{\n  \"dish\": \"bluefin tuna otoro nigiri, brushed soy\",\n  \"candidates\": {\n    \"a\": \"junmai daiginjo sake, chilled\",\n    \"b\": \"young Chinon Cabernet Franc\",\n    \"c\": \"dry Riesling Kabinett\",\n    \"d\": \"sweet Sauternes\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "c",
          "b",
          "d"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 407,
            "output_tokens": 4,
            "latency_ms": 1085.0,
            "usd": 0.001281,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 407,
            "output_tokens": 4,
            "latency_ms": 990.4,
            "usd": 0.001281,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 407,
            "output_tokens": 4,
            "latency_ms": 1271.1,
            "usd": 0.001281,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 407.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1115.5,
        "usd_sum": 0.003843
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-015",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "hard",
      "cuisine_region": "moroccan",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "f5ecf9dcd59f0d0a",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage.\n\nINPUT:\n{\n  \"dish\": \"slow-cooked lamb shoulder tagine with preserved lemon and green olives\",\n  \"candidates\": {\n    \"a\": \"Southern Rhône Grenache-Syrah blend (Gigondas), moderate tannin\",\n    \"b\": \"unoaked Muscadet\",\n    \"c\": \"Champagne Blanc de Blancs\",\n    \"d\": \"young high-tannin Barolo\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "d",
          "c",
          "b"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 411,
            "output_tokens": 4,
            "latency_ms": 1721.5,
            "usd": 0.001293,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 411,
            "output_tokens": 4,
            "latency_ms": 1344.7,
            "usd": 0.001293,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 411,
            "output_tokens": 4,
            "latency_ms": 7793.3,
            "usd": 0.001293,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 411.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 3619.833,
        "usd_sum": 0.003879
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-016",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "vietnamese",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "3dd7c0170877ac9b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for a bowl of Vietnamese beef pho.\n\nINPUT:\n{\n  \"dish\": \"pho bo: beef broth with star anise, cinnamon, rice noodles, rare beef, basil, lime\",\n  \"candidates\": {\n    \"a\": \"cold Vietnamese lager-style beer (e.g. Saigon or 333)\",\n    \"b\": \"tannic Cabernet Sauvignon\",\n    \"c\": \"heavily oaked Chardonnay\",\n    \"d\": \"sweet Tokaji Aszú 5 puttonyos\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "b",
          "c",
          "d"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 435,
            "output_tokens": 4,
            "latency_ms": 741.6,
            "usd": 0.001365,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 435,
            "output_tokens": 4,
            "latency_ms": 675.2,
            "usd": 0.001365,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 435,
            "output_tokens": 4,
            "latency_ms": 1208.7,
            "usd": 0.001365,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 435.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 875.167,
        "usd_sum": 0.004095
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-017",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "hard",
      "cuisine_region": "french",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "8f60336584f33baf",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for 24-month aged Comté served on its own.\n\nINPUT:\n{\n  \"dish\": \"24-month aged Comté, served at room temperature\",\n  \"candidates\": {\n    \"a\": \"Jura Vin Jaune (Savagnin, oxidative, nutty)\",\n    \"b\": \"young oaky Cabernet Sauvignon\",\n    \"c\": \"dry Muscadet\",\n    \"d\": \"Sauternes\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "d",
          "b",
          "c"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 412,
            "output_tokens": 4,
            "latency_ms": 828.2,
            "usd": 0.001296,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 412,
            "output_tokens": 4,
            "latency_ms": 864.5,
            "usd": 0.001296,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 412,
            "output_tokens": 4,
            "latency_ms": 1401.8,
            "usd": 0.001296,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 412.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1031.5,
        "usd_sum": 0.003888
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-018",
      "category": "pairing",
      "capability": "regional_pairing",
      "difficulty": "medium",
      "cuisine_region": "thai",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "9209873bc05779b7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage for a fragrant, moderately spicy Thai green curry with chicken and Thai basil.\n\nINPUT:\n{\n  \"dish\": \"Thai green curry with chicken, Thai basil, kaffir lime\",\n  \"candidates\": {\n    \"a\": \"off-dry New Zealand Sauvignon Blanc with residual sugar\",\n    \"b\": \"heavy Port wine\",\n    \"c\": \"young tannic Bordeaux\",\n    \"d\": \"Islay peated whisky\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "c",
          "b",
          "d"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 411,
            "output_tokens": 4,
            "latency_ms": 758.8,
            "usd": 0.001293,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 411,
            "output_tokens": 4,
            "latency_ms": 721.8,
            "usd": 0.001293,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 411,
            "output_tokens": 4,
            "latency_ms": 933.5,
            "usd": 0.001293,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 411.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 804.7,
        "usd_sum": 0.003879
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-019",
      "category": "pairing",
      "capability": "best_beverage_selection",
      "difficulty": "medium",
      "cuisine_region": "mexican",
      "scorer": "ranked_selection",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "da735293a9d3102b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nPick the single best beverage.\n\nINPUT:\n{\n  \"dish\": \"Oaxacan mole negro enchiladas (rich chocolate-chile-spice sauce over chicken)\",\n  \"candidates\": {\n    \"a\": \"medium-tannin Garnacha from Aragón\",\n    \"b\": \"unoaked Muscadet\",\n    \"c\": \"very sweet late-harvest Riesling\",\n    \"d\": \"bone-dry Brut Champagne\"\n  }\n}\n\nANSWER FORMAT: one_of:a,b,c,d",
      "expected": {
        "value": "a",
        "ranking": [
          "a",
          "d",
          "c",
          "b"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 408,
            "output_tokens": 4,
            "latency_ms": 892.0,
            "usd": 0.001284,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 408,
            "output_tokens": 4,
            "latency_ms": 1102.3,
            "usd": 0.001284,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "a",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a",
            "expected_best": "a",
            "rank": 0
          },
          "metrics": {
            "input_tokens": 408,
            "output_tokens": 4,
            "latency_ms": 727.6,
            "usd": 0.001284,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 408.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 907.3,
        "usd_sum": 0.003852
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:pair-020",
      "category": "pairing",
      "capability": "pairing_grade_reasoning",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.grade_scale"
      ],
      "prompt_template_hash": "bac251ebb1c064ef",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Grade scale (derived from 0-100 overall_score)\n\n- **A+**: 95-100\n- **A**: 90-94\n- **B+**: 85-89\n- **B**: 80-84\n- **C+**: 75-79\n- **C**: 70-74\n- **D**: 60-69\n- **F**: <60\n\n---\n\nA pairing's overall_score is computed as 96. Using the OGS grade scale, return the letter grade.\n\nINPUT:\n{\n  \"overall_score\": 96\n}\n\nANSWER FORMAT: one_of:A+,A,B+,B,C+,C,D,F",
      "expected": {
        "value": "A+"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "A+",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a+",
            "expected": "a+",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 412,
            "output_tokens": 5,
            "latency_ms": 1377.4,
            "usd": 0.001311,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "A+",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a+",
            "expected": "a+",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 412,
            "output_tokens": 5,
            "latency_ms": 1397.5,
            "usd": 0.001311,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "A+",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "a+",
            "expected": "a+",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 412,
            "output_tokens": 5,
            "latency_ms": 1253.6,
            "usd": 0.001311,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 412.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1342.833,
        "usd_sum": 0.003933
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-001",
      "category": "regional",
      "capability": "wine_region_knowledge",
      "difficulty": "easy",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "1df867dd3a55af5b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nWhat is the ONLY grape variety permitted in Barolo DOCG? Respond with just the variety name.\n\nANSWER FORMAT: string",
      "expected": {
        "value": "nebbiolo"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Nebbiolo",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "nebbiolo",
            "expected": "nebbiolo",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 278,
            "output_tokens": 7,
            "latency_ms": 1525.6,
            "usd": 0.000939,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Nebbiolo",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "nebbiolo",
            "expected": "nebbiolo",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 278,
            "output_tokens": 7,
            "latency_ms": 1343.7,
            "usd": 0.000939,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Nebbiolo",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "nebbiolo",
            "expected": "nebbiolo",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 278,
            "output_tokens": 7,
            "latency_ms": 742.4,
            "usd": 0.000939,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 278.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 1203.9,
        "usd_sum": 0.002817
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-002",
      "category": "regional",
      "capability": "wine_region_knowledge",
      "difficulty": "easy",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "24a95f8a1c0a0a45",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nWhat single grape variety is used for Chablis AOC? Respond with just the variety name.\n\nANSWER FORMAT: string",
      "expected": {
        "value": "chardonnay"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Chardonnay",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "chardonnay",
            "expected": "chardonnay",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 276,
            "output_tokens": 8,
            "latency_ms": 778.8,
            "usd": 0.000948,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Chardonnay",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "chardonnay",
            "expected": "chardonnay",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 276,
            "output_tokens": 8,
            "latency_ms": 1039.7,
            "usd": 0.000948,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Chardonnay",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "chardonnay",
            "expected": "chardonnay",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 276,
            "output_tokens": 8,
            "latency_ms": 801.8,
            "usd": 0.000948,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 276.0,
        "output_tokens_mean": 8.0,
        "latency_ms_mean": 873.433,
        "usd_sum": 0.002844
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-003",
      "category": "regional",
      "capability": "dish_origin",
      "difficulty": "easy",
      "cuisine_region": "vietnamese",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "15d36148f8b853e6",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nWhich country is phở originally from? Respond with just the country name.\n\nANSWER FORMAT: string",
      "expected": {
        "value": "vietnam"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Vietnam",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "vietnam",
            "expected": "vietnam",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 270,
            "output_tokens": 4,
            "latency_ms": 799.4,
            "usd": 0.00087,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Vietnam",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "vietnam",
            "expected": "vietnam",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 270,
            "output_tokens": 4,
            "latency_ms": 878.7,
            "usd": 0.00087,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Vietnam",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "vietnam",
            "expected": "vietnam",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 270,
            "output_tokens": 4,
            "latency_ms": 970.8,
            "usd": 0.00087,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 270.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 882.967,
        "usd_sum": 0.00261
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-004",
      "category": "regional",
      "capability": "classic_regional_pairing",
      "difficulty": "hard",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "e332af036a2bd591",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nName the classic Alsace wine traditionally paired with ripe, stinky Muenster cheese. One word, grape variety only.\n\nANSWER FORMAT: string",
      "expected": {
        "value": "gewurztraminer"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Gewurztraminer",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "gewurztraminer",
            "expected": "gewurztraminer",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 283,
            "output_tokens": 9,
            "latency_ms": 1171.3,
            "usd": 0.000984,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Gewurztraminer",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "gewurztraminer",
            "expected": "gewurztraminer",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 283,
            "output_tokens": 9,
            "latency_ms": 1420.9,
            "usd": 0.000984,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Gewurztraminer",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "gewurztraminer",
            "expected": "gewurztraminer",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 283,
            "output_tokens": 9,
            "latency_ms": 957.6,
            "usd": 0.000984,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 283.0,
        "output_tokens_mean": 9.0,
        "latency_ms_mean": 1183.267,
        "usd_sum": 0.002952
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-005",
      "category": "regional",
      "capability": "french_classical_cuisine",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "4aa2c63ffdeeacfb",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nName the French mother sauce that is made from a brown roux, brown stock and mirepoix, traditionally enriched with tomato. One word, lowercase.\n\nANSWER FORMAT: string",
      "expected": {
        "value": "espagnole"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "espagnole",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "espagnole",
            "expected": "espagnole",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 288,
            "output_tokens": 6,
            "latency_ms": 799.1,
            "usd": 0.000954,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "espagnole",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "espagnole",
            "expected": "espagnole",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 288,
            "output_tokens": 6,
            "latency_ms": 981.5,
            "usd": 0.000954,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "espagnole",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "espagnole",
            "expected": "espagnole",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 288,
            "output_tokens": 6,
            "latency_ms": 712.1,
            "usd": 0.000954,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 288.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 830.9,
        "usd_sum": 0.002862
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-006",
      "category": "regional",
      "capability": "wine_region_knowledge",
      "difficulty": "medium",
      "cuisine_region": "spanish",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "979e7579b34d9797",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nWhat is the name of the official Rioja classification for a red wine aged at least 2 years (1 in oak) and 3+ years total? Respond with just the term.\n\nINPUT:\n{\n  \"question\": \"official Rioja classification for red wine aged 2+ years (1+ in oak), 3+ total\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "reserva"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Crianza",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "crianza",
            "expected": "reserva",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 335,
            "output_tokens": 6,
            "latency_ms": 778.9,
            "usd": 0.001095,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Crianza",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "crianza",
            "expected": "reserva",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 335,
            "output_tokens": 6,
            "latency_ms": 1395.3,
            "usd": 0.001095,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Crianza",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "crianza",
            "expected": "reserva",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 335,
            "output_tokens": 6,
            "latency_ms": 882.1,
            "usd": 0.001095,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 335.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 1018.767,
        "usd_sum": 0.003285
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-007",
      "category": "regional",
      "capability": "dish_origin",
      "difficulty": "easy",
      "cuisine_region": "korean",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "23a69016b1082553",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nFrom which country does kimchi originate? Respond with just the country name in English.\n\nINPUT:\n{\n  \"dish\": \"kimchi\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "korea",
        "aliases": [
          "south korea",
          "republic of korea"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Korea",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "korea",
            "expected": "korea",
            "aliases": [
              "south korea",
              "republic of korea"
            ]
          },
          "metrics": {
            "input_tokens": 291,
            "output_tokens": 4,
            "latency_ms": 657.6,
            "usd": 0.000933,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Korea",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "korea",
            "expected": "korea",
            "aliases": [
              "south korea",
              "republic of korea"
            ]
          },
          "metrics": {
            "input_tokens": 291,
            "output_tokens": 4,
            "latency_ms": 768.5,
            "usd": 0.000933,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Korea",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "korea",
            "expected": "korea",
            "aliases": [
              "south korea",
              "republic of korea"
            ]
          },
          "metrics": {
            "input_tokens": 291,
            "output_tokens": 4,
            "latency_ms": 870.8,
            "usd": 0.000933,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 291.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 765.633,
        "usd_sum": 0.002799
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-008",
      "category": "regional",
      "capability": "wine_region_knowledge",
      "difficulty": "hard",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "0a12eb057eb17654",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nWhat is the single grape variety required for Barbaresco DOCG?\n\nINPUT:\n{\n  \"wine\": \"Barbaresco DOCG\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "nebbiolo"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Nebbiolo",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "nebbiolo",
            "expected": "nebbiolo",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 291,
            "output_tokens": 7,
            "latency_ms": 1007.1,
            "usd": 0.000978,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Nebbiolo",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "nebbiolo",
            "expected": "nebbiolo",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 291,
            "output_tokens": 7,
            "latency_ms": 1252.1,
            "usd": 0.000978,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Nebbiolo",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "nebbiolo",
            "expected": "nebbiolo",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 291,
            "output_tokens": 7,
            "latency_ms": 1139.1,
            "usd": 0.000978,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 291.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 1132.767,
        "usd_sum": 0.002934
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-009",
      "category": "regional",
      "capability": "dish_origin",
      "difficulty": "medium",
      "cuisine_region": "peruvian",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "767d178c3cce3333",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nWhich country is classic ceviche most commonly attributed to? Respond with the country name.\n\nINPUT:\n{\n  \"dish\": \"ceviche\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "peru"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Peru",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "peru",
            "expected": "peru",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 292,
            "output_tokens": 4,
            "latency_ms": 814.8,
            "usd": 0.000936,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Peru",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "peru",
            "expected": "peru",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 292,
            "output_tokens": 4,
            "latency_ms": 915.9,
            "usd": 0.000936,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Peru",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "peru",
            "expected": "peru",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 292,
            "output_tokens": 4,
            "latency_ms": 987.2,
            "usd": 0.000936,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 292.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 905.967,
        "usd_sum": 0.002808
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-010",
      "category": "regional",
      "capability": "dish_origin",
      "difficulty": "easy",
      "cuisine_region": "spanish",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "1001169bc92aaacd",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nIn which country does paella valenciana originate? Respond with the country name.\n\nINPUT:\n{\n  \"dish\": \"paella valenciana\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "spain"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Spain",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "spain",
            "expected": "spain",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 292,
            "output_tokens": 4,
            "latency_ms": 1717.4,
            "usd": 0.000936,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Spain",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "spain",
            "expected": "spain",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 292,
            "output_tokens": 4,
            "latency_ms": 600.3,
            "usd": 0.000936,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Spain",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "spain",
            "expected": "spain",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 292,
            "output_tokens": 4,
            "latency_ms": 846.7,
            "usd": 0.000936,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 292.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1054.8,
        "usd_sum": 0.002808
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-011",
      "category": "regional",
      "capability": "classic_regional_pairing",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "571e5074e66e6a11",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nName the classic French white wine region traditionally paired with raw oysters. Respond with just the region name.\n\nINPUT:\n{\n  \"dish\": \"raw oysters on the half shell\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "chablis",
        "aliases": [
          "muscadet"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Muscadet",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "muscadet",
            "expected": "chablis",
            "aliases": [
              "muscadet"
            ]
          },
          "metrics": {
            "input_tokens": 299,
            "output_tokens": 7,
            "latency_ms": 1815.0,
            "usd": 0.001002,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Muscadet",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "muscadet",
            "expected": "chablis",
            "aliases": [
              "muscadet"
            ]
          },
          "metrics": {
            "input_tokens": 299,
            "output_tokens": 7,
            "latency_ms": 1139.0,
            "usd": 0.001002,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Muscadet",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "muscadet",
            "expected": "chablis",
            "aliases": [
              "muscadet"
            ]
          },
          "metrics": {
            "input_tokens": 299,
            "output_tokens": 7,
            "latency_ms": 1497.6,
            "usd": 0.001002,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 299.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 1483.867,
        "usd_sum": 0.003006
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-012",
      "category": "regional",
      "capability": "french_classical_cuisine",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "d9792be4d5ca7335",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nName the French mother sauce made from clarified butter, egg yolks, and lemon juice.\n\nINPUT:\n{\n  \"description\": \"emulsion of clarified butter, egg yolks, and lemon juice\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "hollandaise"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Hollandaise",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "hollandaise",
            "expected": "hollandaise",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 307,
            "output_tokens": 7,
            "latency_ms": 911.4,
            "usd": 0.001026,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Hollandaise",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "hollandaise",
            "expected": "hollandaise",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 307,
            "output_tokens": 7,
            "latency_ms": 2484.5,
            "usd": 0.001026,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Hollandaise",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "hollandaise",
            "expected": "hollandaise",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 307,
            "output_tokens": 7,
            "latency_ms": 872.8,
            "usd": 0.001026,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 307.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 1422.9,
        "usd_sum": 0.003078
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-013",
      "category": "regional",
      "capability": "dish_origin",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "c831a091be5025f9",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nIn which country did edomae-style sushi originate?\n\nINPUT:\n{\n  \"dish\": \"edomae sushi\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "japan"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Japan",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "japan",
            "expected": "japan",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 289,
            "output_tokens": 4,
            "latency_ms": 1095.8,
            "usd": 0.000927,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Japan",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "japan",
            "expected": "japan",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 289,
            "output_tokens": 4,
            "latency_ms": 1602.4,
            "usd": 0.000927,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Japan",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "japan",
            "expected": "japan",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 289,
            "output_tokens": 4,
            "latency_ms": 1250.0,
            "usd": 0.000927,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 289.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1316.067,
        "usd_sum": 0.002781
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-014",
      "category": "regional",
      "capability": "dish_origin",
      "difficulty": "medium",
      "cuisine_region": "moroccan",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "7d761cbfb86342af",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nIn which country does the slow-cooked dish tagine originate?\n\nINPUT:\n{\n  \"dish\": \"tagine (clay-pot slow-cooked stew)\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "morocco"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Morocco",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "morocco",
            "expected": "morocco",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 297,
            "output_tokens": 4,
            "latency_ms": 1113.2,
            "usd": 0.000951,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Morocco",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "morocco",
            "expected": "morocco",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 297,
            "output_tokens": 4,
            "latency_ms": 1156.8,
            "usd": 0.000951,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Morocco",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "morocco",
            "expected": "morocco",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 297,
            "output_tokens": 4,
            "latency_ms": 1304.2,
            "usd": 0.000951,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 297.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1191.4,
        "usd_sum": 0.002853
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-015",
      "category": "regional",
      "capability": "wine_region_knowledge",
      "difficulty": "hard",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "2f35d24bb9d58118",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nWhat single grape variety is used for white Sancerre AOC?\n\nINPUT:\n{\n  \"wine\": \"Sancerre AOC blanc\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "sauvignon blanc",
        "aliases": [
          "sauvignon_blanc",
          "sauvignon"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Sauvignon",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "sauvignon",
            "expected": "sauvignon blanc",
            "aliases": [
              "sauvignon_blanc",
              "sauvignon"
            ]
          },
          "metrics": {
            "input_tokens": 292,
            "output_tokens": 7,
            "latency_ms": 1180.9,
            "usd": 0.000981,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Sauvignon",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "sauvignon",
            "expected": "sauvignon blanc",
            "aliases": [
              "sauvignon_blanc",
              "sauvignon"
            ]
          },
          "metrics": {
            "input_tokens": 292,
            "output_tokens": 7,
            "latency_ms": 955.9,
            "usd": 0.000981,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Sauvignon",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "sauvignon",
            "expected": "sauvignon blanc",
            "aliases": [
              "sauvignon_blanc",
              "sauvignon"
            ]
          },
          "metrics": {
            "input_tokens": 292,
            "output_tokens": 7,
            "latency_ms": 1342.8,
            "usd": 0.000981,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 292.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 1159.867,
        "usd_sum": 0.002943
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-016",
      "category": "regional",
      "capability": "dish_origin",
      "difficulty": "easy",
      "cuisine_region": "mexican",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "efb489c56faf463a",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nIn which country does mole poblano originate?\n\nINPUT:\n{\n  \"dish\": \"mole poblano\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "mexico"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Mexico",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "mexico",
            "expected": "mexico",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 285,
            "output_tokens": 4,
            "latency_ms": 753.9,
            "usd": 0.000915,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Mexico",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "mexico",
            "expected": "mexico",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 285,
            "output_tokens": 4,
            "latency_ms": 875.4,
            "usd": 0.000915,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Mexico",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "mexico",
            "expected": "mexico",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 285,
            "output_tokens": 4,
            "latency_ms": 811.8,
            "usd": 0.000915,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 285.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 813.7,
        "usd_sum": 0.002745
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-017",
      "category": "regional",
      "capability": "classic_regional_pairing",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "53530e4634162afc",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nName the Alsatian white wine traditionally served with choucroute garnie.\n\nINPUT:\n{\n  \"dish\": \"choucroute garnie\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "riesling"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Riesling",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "riesling",
            "expected": "riesling",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 294,
            "output_tokens": 7,
            "latency_ms": 885.3,
            "usd": 0.000987,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Riesling",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "riesling",
            "expected": "riesling",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 294,
            "output_tokens": 7,
            "latency_ms": 951.1,
            "usd": 0.000987,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Riesling",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "riesling",
            "expected": "riesling",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 294,
            "output_tokens": 7,
            "latency_ms": 934.9,
            "usd": 0.000987,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 294.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 923.767,
        "usd_sum": 0.002961
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-018",
      "category": "regional",
      "capability": "dish_origin",
      "difficulty": "medium",
      "cuisine_region": "brazilian",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "28a636bdc82633de",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nWhich country claims feijoada completa as a national dish?\n\nINPUT:\n{\n  \"dish\": \"feijoada completa\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "brazil"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Brazil",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "brazil",
            "expected": "brazil",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 290,
            "output_tokens": 4,
            "latency_ms": 1237.8,
            "usd": 0.00093,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Brazil",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "brazil",
            "expected": "brazil",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 290,
            "output_tokens": 4,
            "latency_ms": 859.9,
            "usd": 0.00093,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Brazil",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "brazil",
            "expected": "brazil",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 290,
            "output_tokens": 4,
            "latency_ms": 1017.9,
            "usd": 0.00093,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 290.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1038.533,
        "usd_sum": 0.00279
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-019",
      "category": "regional",
      "capability": "wine_region_knowledge",
      "difficulty": "hard",
      "cuisine_region": "hungarian",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "2c5869ce479f65cc",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nTokaji Aszú's vineyards are centred in a single country. Name that country in English.\n\nINPUT:\n{\n  \"wine\": \"Tokaji Aszú\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "hungary"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Hungary",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "hungary",
            "expected": "hungary",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 302,
            "output_tokens": 4,
            "latency_ms": 1722.4,
            "usd": 0.000966,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Hungary",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "hungary",
            "expected": "hungary",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 302,
            "output_tokens": 4,
            "latency_ms": 2365.2,
            "usd": 0.000966,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Hungary",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "hungary",
            "expected": "hungary",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 302,
            "output_tokens": 4,
            "latency_ms": 957.3,
            "usd": 0.000966,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 302.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1681.633,
        "usd_sum": 0.002898
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:region-020",
      "category": "regional",
      "capability": "classic_regional_pairing",
      "difficulty": "hard",
      "cuisine_region": "spanish",
      "scorer": "exact_match",
      "context_needs": [],
      "prompt_template_hash": "aa3761bdbf2d490e",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n---\n\nName the Galician white wine grape traditionally paired with pulpo á feira (boiled octopus with pimentón).\n\nINPUT:\n{\n  \"dish\": \"pulpo á feira — Galician boiled octopus with pimentón and olive oil\"\n}\n\nANSWER FORMAT: one_word",
      "expected": {
        "value": "albarino",
        "aliases": [
          "albariño"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "Albariño",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "albariño",
            "expected": "albarino",
            "aliases": [
              "albariño"
            ]
          },
          "metrics": {
            "input_tokens": 316,
            "output_tokens": 7,
            "latency_ms": 1220.6,
            "usd": 0.001053,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "Albariño",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "albariño",
            "expected": "albarino",
            "aliases": [
              "albariño"
            ]
          },
          "metrics": {
            "input_tokens": 316,
            "output_tokens": 7,
            "latency_ms": 1341.0,
            "usd": 0.001053,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "Albariño",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "albariño",
            "expected": "albarino",
            "aliases": [
              "albariño"
            ]
          },
          "metrics": {
            "input_tokens": 316,
            "output_tokens": 7,
            "latency_ms": 1156.2,
            "usd": 0.001053,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 316.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 1239.267,
        "usd_sum": 0.003159
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-001",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "08845fcf7f8beb09",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"salmon and tuna nigiri\",\n  \"beverage\": \"young Bordeaux Left Bank (Cabernet Sauvignon, high tannin)\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_TANNIN_FISH"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_TANNIN_FISH",
            "expected": "RISK_TANNIN_FISH",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 788,
            "output_tokens": 13,
            "latency_ms": 1070.9,
            "usd": 0.002559,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_TANNIN_FISH",
            "expected": "RISK_TANNIN_FISH",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 788,
            "output_tokens": 13,
            "latency_ms": 1317.4,
            "usd": 0.002559,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_TANNIN_FISH",
            "expected": "RISK_TANNIN_FISH",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 788,
            "output_tokens": 13,
            "latency_ms": 1135.8,
            "usd": 0.002559,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 788.0,
        "output_tokens_mean": 13.0,
        "latency_ms_mean": 1174.7,
        "usd_sum": 0.007677
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-002",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "7a188b424515258f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"Thai green curry with bird's eye chilli, high heat\",\n  \"beverage\": \"15.5% ABV California Zinfandel\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_SPICE_ALCOHOL"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_SPICE_ALCOHOL",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_SPICE_ALCOHOL",
            "expected": "RISK_SPICE_ALCOHOL",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 786,
            "output_tokens": 12,
            "latency_ms": 713.0,
            "usd": 0.002538,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_SPICE_ALCOHOL",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_SPICE_ALCOHOL",
            "expected": "RISK_SPICE_ALCOHOL",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 786,
            "output_tokens": 12,
            "latency_ms": 956.9,
            "usd": 0.002538,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_SPICE_ALCOHOL",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_SPICE_ALCOHOL",
            "expected": "RISK_SPICE_ALCOHOL",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 786,
            "output_tokens": 12,
            "latency_ms": 1262.5,
            "usd": 0.002538,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 786.0,
        "output_tokens_mean": 12.0,
        "latency_ms_mean": 977.467,
        "usd_sum": 0.007614
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-003",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "a60fa6dd5c1ff452",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code for the pairing. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"delicate flounder sashimi with ponzu\",\n  \"beverage\": \"heavily new-oaked California Chardonnay, 14% ABV, buttery ML\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_OAK_MASKS"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_OAK_MASKS",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_OAK_MASKS",
            "expected": "RISK_OAK_MASKS",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 790,
            "output_tokens": 12,
            "latency_ms": 775.8,
            "usd": 0.00255,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_OAK_MASKS",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_OAK_MASKS",
            "expected": "RISK_OAK_MASKS",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 790,
            "output_tokens": 12,
            "latency_ms": 1003.5,
            "usd": 0.00255,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_OAK_MASKS",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_OAK_MASKS",
            "expected": "RISK_OAK_MASKS",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 790,
            "output_tokens": 12,
            "latency_ms": 908.6,
            "usd": 0.00255,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 790.0,
        "output_tokens_mean": 12.0,
        "latency_ms_mean": 895.967,
        "usd_sum": 0.00765
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-004",
      "category": "risk",
      "capability": "risk_severity",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "16286de7fea14ae1",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nUsing OGS risk severity levels (low/medium/high), classify the RISK_ALCOHOL_OVERPOWERS severity for the pairing below. Respond with just the severity token.\n\nINPUT:\n{\n  \"dish\": \"pan-seared halibut with beurre blanc, very delicate flavours\",\n  \"beverage\": \"15.5% ABV Amarone della Valpolicella\"\n}\n\nANSWER FORMAT: one_of:low,medium,high",
      "expected": {
        "value": "high"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "high",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "high",
            "expected": "high",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 732,
            "output_tokens": 4,
            "latency_ms": 1047.8,
            "usd": 0.002256,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "high",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "high",
            "expected": "high",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 732,
            "output_tokens": 4,
            "latency_ms": 675.8,
            "usd": 0.002256,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "high",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "high",
            "expected": "high",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 732,
            "output_tokens": 4,
            "latency_ms": 815.4,
            "usd": 0.002256,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 732.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 846.333,
        "usd_sum": 0.006768
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-005",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "cd44f8ac361108e8",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code for the pairing. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"caramel panna cotta with honey\",\n  \"beverage\": \"bone-dry Brut Nature Champagne (zero dosage)\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_SWEETNESS_CONFLICT"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_SWEETNESS_CONFLICT",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_SWEETNESS_CONFLICT",
            "expected": "RISK_SWEETNESS_CONFLICT",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 782,
            "output_tokens": 12,
            "latency_ms": 1196.0,
            "usd": 0.002526,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_SWEETNESS_CONFLICT",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_SWEETNESS_CONFLICT",
            "expected": "RISK_SWEETNESS_CONFLICT",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 782,
            "output_tokens": 12,
            "latency_ms": 1116.5,
            "usd": 0.002526,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_SWEETNESS_CONFLICT",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "RISK_SWEETNESS_CONFLICT",
            "expected": "RISK_SWEETNESS_CONFLICT",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 782,
            "output_tokens": 12,
            "latency_ms": 2337.2,
            "usd": 0.002526,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 782.0,
        "output_tokens_mean": 12.0,
        "latency_ms_mean": 1549.9,
        "usd_sum": 0.007578
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-006",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "easy",
      "cuisine_region": "japanese",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "28cb271326a709c7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"raw tuna sashimi with a light soy-wasabi dip\",\n  \"beverage\": \"young tannic Napa Cabernet Sauvignon\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_TANNIN_FISH"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_tannin_fish",
            "expected": "risk_tannin_fish",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 792,
            "output_tokens": 13,
            "latency_ms": 740.0,
            "usd": 0.002571,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_tannin_fish",
            "expected": "risk_tannin_fish",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 792,
            "output_tokens": 13,
            "latency_ms": 839.8,
            "usd": 0.002571,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_tannin_fish",
            "expected": "risk_tannin_fish",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 792,
            "output_tokens": 13,
            "latency_ms": 792.1,
            "usd": 0.002571,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 792.0,
        "output_tokens_mean": 13.0,
        "latency_ms_mean": 790.633,
        "usd_sum": 0.007713
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-007",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "thai",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "93c46eac23eca249",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"spicy Thai green curry with chilies and coconut milk\",\n  \"beverage\": \"15.5% abv California Zinfandel\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_SPICE_ALCOHOL"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_SPICE_ALCOHOL",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_spice_alcohol",
            "expected": "risk_spice_alcohol",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 787,
            "output_tokens": 12,
            "latency_ms": 717.0,
            "usd": 0.002541,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_SPICE_ALCOHOL",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_spice_alcohol",
            "expected": "risk_spice_alcohol",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 787,
            "output_tokens": 12,
            "latency_ms": 889.6,
            "usd": 0.002541,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_SPICE_ALCOHOL",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_spice_alcohol",
            "expected": "risk_spice_alcohol",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 787,
            "output_tokens": 12,
            "latency_ms": 773.8,
            "usd": 0.002541,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 787.0,
        "output_tokens_mean": 12.0,
        "latency_ms_mean": 793.467,
        "usd_sum": 0.007623
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-008",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "2409e606629eebf9",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"grilled escarole salad with bitter radicchio and walnuts\",\n  \"beverage\": \"bone-dry bitter amontillado sherry\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_BITTER_AMPLIFY"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_BITTER_AMPLIFY",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_bitter_amplify",
            "expected": "risk_bitter_amplify",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 789,
            "output_tokens": 12,
            "latency_ms": 1189.1,
            "usd": 0.002547,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_BITTER_AMPLIFY",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_bitter_amplify",
            "expected": "risk_bitter_amplify",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 789,
            "output_tokens": 12,
            "latency_ms": 1932.1,
            "usd": 0.002547,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_BITTER_AMPLIFY",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_bitter_amplify",
            "expected": "risk_bitter_amplify",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 789,
            "output_tokens": 12,
            "latency_ms": 1615.0,
            "usd": 0.002547,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 789.0,
        "output_tokens_mean": 12.0,
        "latency_ms_mean": 1578.733,
        "usd_sum": 0.007641
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-009",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "a00498fd7fce7cb7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"raw scallop crudo with yuzu\",\n  \"beverage\": \"heavily oaked California Chardonnay, new French barrels\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_OAK_MASKS"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_OAK_MASKS",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_oak_masks",
            "expected": "risk_oak_masks",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 786,
            "output_tokens": 12,
            "latency_ms": 1163.5,
            "usd": 0.002538,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_OAK_MASKS",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_oak_masks",
            "expected": "risk_oak_masks",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 786,
            "output_tokens": 12,
            "latency_ms": 912.4,
            "usd": 0.002538,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_OAK_MASKS",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_oak_masks",
            "expected": "risk_oak_masks",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 786,
            "output_tokens": 12,
            "latency_ms": 869.8,
            "usd": 0.002538,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 786.0,
        "output_tokens_mean": 12.0,
        "latency_ms_mean": 981.9,
        "usd_sum": 0.007614
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-010",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "b28af5fc40407ce0",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"warm clear beef consommé\",\n  \"beverage\": \"sweet late-harvest Moscato d'Asti\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_SWEETNESS_CONFLICT"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_SWEETNESS_CONFLICT",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_sweetness_conflict",
            "expected": "risk_sweetness_conflict",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 781,
            "output_tokens": 12,
            "latency_ms": 928.1,
            "usd": 0.002523,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_SWEETNESS_CONFLICT",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_sweetness_conflict",
            "expected": "risk_sweetness_conflict",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 781,
            "output_tokens": 12,
            "latency_ms": 1235.9,
            "usd": 0.002523,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_SWEETNESS_CONFLICT",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_sweetness_conflict",
            "expected": "risk_sweetness_conflict",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 781,
            "output_tokens": 12,
            "latency_ms": 894.9,
            "usd": 0.002523,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 781.0,
        "output_tokens_mean": 12.0,
        "latency_ms_mean": 1019.633,
        "usd_sum": 0.007569
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-011",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "fba6f85afa81595f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"frisée salad with aggressive Dijon vinaigrette (1:2 vinegar to oil)\",\n  \"beverage\": \"bone-dry Brut Nature Champagne\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_ACID_CLASH"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_ACID_CLASH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_acid_clash",
            "expected": "risk_acid_clash",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 796,
            "output_tokens": 11,
            "latency_ms": 1103.2,
            "usd": 0.002553,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_ACID_CLASH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_acid_clash",
            "expected": "risk_acid_clash",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 796,
            "output_tokens": 11,
            "latency_ms": 9530.0,
            "usd": 0.002553,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_ACID_CLASH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_acid_clash",
            "expected": "risk_acid_clash",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 796,
            "output_tokens": 11,
            "latency_ms": 1371.9,
            "usd": 0.002553,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 796.0,
        "output_tokens_mean": 11.0,
        "latency_ms_mean": 4001.7,
        "usd_sum": 0.007659
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-012",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "easy",
      "cuisine_region": "italian",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "2f3a4360a39afdc9",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"steamed turbot with beurre blanc\",\n  \"beverage\": \"young powerful Barolo\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_INTENSITY_MISMATCH"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "risk_tannin_fish",
            "expected": "risk_intensity_mismatch",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 777,
            "output_tokens": 13,
            "latency_ms": 905.3,
            "usd": 0.002526,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "risk_tannin_fish",
            "expected": "risk_intensity_mismatch",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 777,
            "output_tokens": 13,
            "latency_ms": 786.7,
            "usd": 0.002526,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "risk_tannin_fish",
            "expected": "risk_intensity_mismatch",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 777,
            "output_tokens": 13,
            "latency_ms": 674.5,
            "usd": 0.002526,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 777.0,
        "output_tokens_mean": 13.0,
        "latency_ms_mean": 788.833,
        "usd_sum": 0.007578
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-013",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "390e90cae060ada8",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"yellowtail sashimi\",\n  \"beverage\": \"overproof 63% abv Jamaican rum\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_ALCOHOL_OVERPOWERS"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_ALCOHOL_OVERPOWERS",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_alcohol_overpowers",
            "expected": "risk_alcohol_overpowers",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 781,
            "output_tokens": 13,
            "latency_ms": 987.0,
            "usd": 0.002538,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_ALCOHOL_OVERPOWERS",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_alcohol_overpowers",
            "expected": "risk_alcohol_overpowers",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 781,
            "output_tokens": 13,
            "latency_ms": 1915.6,
            "usd": 0.002538,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_ALCOHOL_OVERPOWERS",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_alcohol_overpowers",
            "expected": "risk_alcohol_overpowers",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 781,
            "output_tokens": 13,
            "latency_ms": 5260.5,
            "usd": 0.002538,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 781.0,
        "output_tokens_mean": 13.0,
        "latency_ms_mean": 2721.033,
        "usd_sum": 0.007614
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-014",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "d83b8b2ecabe9907",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"gin-cured salmon gravlax with dill\",\n  \"beverage\": \"young tannic Chianti Classico\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_TANNIN_FISH"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_tannin_fish",
            "expected": "risk_tannin_fish",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 783,
            "output_tokens": 13,
            "latency_ms": 1147.3,
            "usd": 0.002544,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_tannin_fish",
            "expected": "risk_tannin_fish",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 783,
            "output_tokens": 13,
            "latency_ms": 1155.9,
            "usd": 0.002544,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_TANNIN_FISH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_tannin_fish",
            "expected": "risk_tannin_fish",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 783,
            "output_tokens": 13,
            "latency_ms": 1195.5,
            "usd": 0.002544,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 783.0,
        "output_tokens_mean": 13.0,
        "latency_ms_mean": 1166.233,
        "usd_sum": 0.007632
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-015",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "1e83097d483e0639",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"poached lobster with drawn butter\",\n  \"beverage\": \"aggressive 80-IBU West Coast IPA\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_BITTER_AMPLIFY"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_BITTER_AMPLIFY",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_bitter_amplify",
            "expected": "risk_bitter_amplify",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 780,
            "output_tokens": 12,
            "latency_ms": 1068.9,
            "usd": 0.00252,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_INTENSITY_MISMATCH",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "risk_intensity_mismatch",
            "expected": "risk_bitter_amplify",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 780,
            "output_tokens": 13,
            "latency_ms": 1056.4,
            "usd": 0.002535,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_INTENSITY_MISMATCH",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "risk_intensity_mismatch",
            "expected": "risk_bitter_amplify",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 780,
            "output_tokens": 13,
            "latency_ms": 1463.6,
            "usd": 0.002535,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.3333,
      "score_summary": {
        "mean": 0.3333,
        "stdev": 0.5774,
        "sem": 0.3333,
        "ci95_lo": -0.32,
        "ci95_hi": 0.9867,
        "n": 3
      },
      "score_if_valid": 0.3333,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 780.0,
        "output_tokens_mean": 12.667,
        "latency_ms_mean": 1196.3,
        "usd_sum": 0.00759
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-016",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "easy",
      "cuisine_region": "mexican",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "b2e02272ca87b2d4",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"habanero-heavy Yucatán salsa\",\n  \"beverage\": \"15% abv Zinfandel\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_SPICE_ALCOHOL"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_SPICE_ALCOHOL",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_spice_alcohol",
            "expected": "risk_spice_alcohol",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 784,
            "output_tokens": 12,
            "latency_ms": 1031.0,
            "usd": 0.002532,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_SPICE_ALCOHOL",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_spice_alcohol",
            "expected": "risk_spice_alcohol",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 784,
            "output_tokens": 12,
            "latency_ms": 1220.6,
            "usd": 0.002532,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_SPICE_ALCOHOL",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_spice_alcohol",
            "expected": "risk_spice_alcohol",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 784,
            "output_tokens": 12,
            "latency_ms": 1590.1,
            "usd": 0.002532,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 784.0,
        "output_tokens_mean": 12.0,
        "latency_ms_mean": 1280.567,
        "usd_sum": 0.007596
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-017",
      "category": "risk",
      "capability": "risk_severity",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "12372c8de13add55",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nUsing OGS risk severity levels (low/medium/high), classify the risk severity of the described pairing. Respond with just the severity level.\n\nINPUT:\n{\n  \"dish\": \"simple grilled branzino with lemon\",\n  \"beverage\": \"heavily oaked California Chardonnay\",\n  \"risk_code\": \"RISK_OAK_MASKS\"\n}\n\nANSWER FORMAT: one_of:low,medium,high",
      "expected": {
        "value": "medium"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "high",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "high",
            "expected": "medium",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 726,
            "output_tokens": 4,
            "latency_ms": 1094.4,
            "usd": 0.002238,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "high",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "high",
            "expected": "medium",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 726,
            "output_tokens": 4,
            "latency_ms": 1232.6,
            "usd": 0.002238,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "high",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "high",
            "expected": "medium",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 726,
            "output_tokens": 4,
            "latency_ms": 1018.2,
            "usd": 0.002238,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 726.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1115.067,
        "usd_sum": 0.006714
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-018",
      "category": "risk",
      "capability": "risk_severity",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "7a97c002792dfad9",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nUsing OGS risk severity levels, classify the severity of a tannic red wine with raw anchovies.\n\nINPUT:\n{\n  \"dish\": \"raw marinated anchovies\",\n  \"beverage\": \"young tannic Cabernet Sauvignon\",\n  \"risk_code\": \"RISK_TANNIN_FISH\"\n}\n\nANSWER FORMAT: one_of:low,medium,high",
      "expected": {
        "value": "high"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "high",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "high",
            "expected": "high",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 716,
            "output_tokens": 4,
            "latency_ms": 1323.4,
            "usd": 0.002208,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "high",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "high",
            "expected": "high",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 716,
            "output_tokens": 4,
            "latency_ms": 1026.2,
            "usd": 0.002208,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "high",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "high",
            "expected": "high",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 716,
            "output_tokens": 4,
            "latency_ms": 1071.2,
            "usd": 0.002208,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 716.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1140.267,
        "usd_sum": 0.006624
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-019",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "german",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "0e3d6c3f9d86af0b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"raw Kumamoto oyster\",\n  \"beverage\": \"off-dry Spätlese Riesling with 45 g/L residual sugar\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_SWEETNESS_CONFLICT"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_SWEETNESS_CONFLICT",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_sweetness_conflict",
            "expected": "risk_sweetness_conflict",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 791,
            "output_tokens": 12,
            "latency_ms": 1129.6,
            "usd": 0.002553,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_SWEETNESS_CONFLICT",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_sweetness_conflict",
            "expected": "risk_sweetness_conflict",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 791,
            "output_tokens": 12,
            "latency_ms": 1455.5,
            "usd": 0.002553,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_SWEETNESS_CONFLICT",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_sweetness_conflict",
            "expected": "risk_sweetness_conflict",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 791,
            "output_tokens": 12,
            "latency_ms": 1397.9,
            "usd": 0.002553,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 791.0,
        "output_tokens_mean": 12.0,
        "latency_ms_mean": 1327.667,
        "usd_sum": 0.007659
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:risk-020",
      "category": "risk",
      "capability": "risk_identification",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.explanation_codes.negative"
      ],
      "prompt_template_hash": "2dd649d25c606ee7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (negative / risk only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n---\n\nReturn the single most important OGS negative explanation-code (risk) for the pairing below. Respond with just the code ID.\n\nINPUT:\n{\n  \"dish\": \"slow-braised oxtail stew with dark jus\",\n  \"beverage\": \"very pale Provençal rosé, 12.5% abv\"\n}\n\nANSWER FORMAT: one_of:RISK_TANNIN_FISH,RISK_ALCOHOL_OVERPOWERS,RISK_BITTER_AMPLIFY,RISK_OAK_MASKS,RISK_INTENSITY_MISMATCH,RISK_ACID_CLASH,RISK_SWEETNESS_CONFLICT,RISK_SPICE_ALCOHOL",
      "expected": {
        "value": "RISK_INTENSITY_MISMATCH"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "RISK_INTENSITY_MISMATCH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_intensity_mismatch",
            "expected": "risk_intensity_mismatch",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 792,
            "output_tokens": 13,
            "latency_ms": 1171.8,
            "usd": 0.002571,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "RISK_INTENSITY_MISMATCH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_intensity_mismatch",
            "expected": "risk_intensity_mismatch",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 792,
            "output_tokens": 13,
            "latency_ms": 1073.3,
            "usd": 0.002571,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "RISK_INTENSITY_MISMATCH",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "risk_intensity_mismatch",
            "expected": "risk_intensity_mismatch",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 792,
            "output_tokens": 13,
            "latency_ms": 2933.6,
            "usd": 0.002571,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 792.0,
        "output_tokens_mean": 13.0,
        "latency_ms_mean": 1726.233,
        "usd_sum": 0.007713
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-001",
      "category": "safety",
      "capability": "cooking_temperature",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "8e37b185cfaed2aa",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. 
Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nWhat is the minimum core temperature (°C) required to safely cook poultry under the OGS HACCP reference? Respond with a single number.\n\nINPUT:\n{\n  \"product\": \"whole chicken\"\n}\n\nANSWER FORMAT: numeric:integer or decimal °C",
      "expected": {
        "value": 75,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "75",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 75.0,
            "expected": 75.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 928,
            "output_tokens": 5,
            "latency_ms": 1768.2,
            "usd": 0.002859,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "75",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 75.0,
            "expected": 75.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 928,
            "output_tokens": 5,
            "latency_ms": 1259.0,
            "usd": 0.002859,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "75",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 75.0,
            "expected": 75.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 928,
            "output_tokens": 5,
            "latency_ms": 1398.3,
            "usd": 0.002859,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 928.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1475.167,
        "usd_sum": 0.008577
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-002",
      "category": "safety",
      "capability": "cooking_temperature",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "d38d6470a18f7e97",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. 
Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nA restaurant serves minced-beef smash burgers. What minimum core temperature (°C) should the patties reach? Respond with a single number.\n\nINPUT:\n{\n  \"product\": \"minced beef patty\"\n}\n\nANSWER FORMAT: numeric:integer °C",
      "expected": {
        "value": 75,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "75",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 75.0,
            "expected": 75.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 928,
            "output_tokens": 5,
            "latency_ms": 796.1,
            "usd": 0.002859,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "75",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 75.0,
            "expected": 75.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 928,
            "output_tokens": 5,
            "latency_ms": 939.5,
            "usd": 0.002859,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "75",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 75.0,
            "expected": 75.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 928,
            "output_tokens": 5,
            "latency_ms": 1298.8,
            "usd": 0.002859,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 928.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1011.467,
        "usd_sum": 0.008577
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-003",
      "category": "safety",
      "capability": "danger_zone",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "97f1d8e6fc44b9e2",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. 
Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nThe bacterial danger zone is defined between two temperatures in °C. Respond as 'LOW-HIGH' in °C (integers), e.g. '10-50'.\n\nANSWER FORMAT: exact:LOW-HIGH °C",
      "expected": {
        "value": "5-63",
        "aliases": [
          "5 to 63",
          "5-63°C",
          "5 C - 63 C"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "5-63 °C",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "5-63 °c",
            "expected": "5-63",
            "aliases": [
              "5 to 63",
              "5-63°C",
              "5 C - 63 C"
            ]
          },
          "metrics": {
            "input_tokens": 915,
            "output_tokens": 10,
            "latency_ms": 1544.4,
            "usd": 0.002895,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "5-63 °C",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "5-63 °c",
            "expected": "5-63",
            "aliases": [
              "5 to 63",
              "5-63°C",
              "5 C - 63 C"
            ]
          },
          "metrics": {
            "input_tokens": 915,
            "output_tokens": 10,
            "latency_ms": 1024.1,
            "usd": 0.002895,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "5-63 °C",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "5-63 °c",
            "expected": "5-63",
            "aliases": [
              "5 to 63",
              "5-63°C",
              "5 C - 63 C"
            ]
          },
          "metrics": {
            "input_tokens": 915,
            "output_tokens": 10,
            "latency_ms": 1131.3,
            "usd": 0.002895,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 915.0,
        "output_tokens_mean": 10.0,
        "latency_ms_mean": 1233.267,
        "usd_sum": 0.008685
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-004",
      "category": "safety",
      "capability": "cooling_rule",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "9e5770b6c7036769",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. 
Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nUnder UK FSA guidance, hot food must be cooled from 63 °C down to 21 °C within how many hours? Respond with a single integer.\n\nANSWER FORMAT: numeric:integer hours",
      "expected": {
        "value": 2,
        "tolerance": 0
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "2",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 2.0,
            "expected": 2.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 912,
            "output_tokens": 5,
            "latency_ms": 871.0,
            "usd": 0.002811,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "2",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 2.0,
            "expected": 2.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 912,
            "output_tokens": 5,
            "latency_ms": 1028.9,
            "usd": 0.002811,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "2",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 2.0,
            "expected": 2.0,
            "tolerance": 0.0
          },
          "metrics": {
            "input_tokens": 912,
            "output_tokens": 5,
            "latency_ms": 1443.7,
            "usd": 0.002811,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 912.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1114.533,
        "usd_sum": 0.008433
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-005",
      "category": "safety",
      "capability": "reheating",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "cb2ff73059921b59",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. 
Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nTo what minimum core temperature (°C) must a previously cooked and chilled dish be reheated under OGS HACCP? Respond with a single number.\n\nANSWER FORMAT: numeric:integer °C",
      "expected": {
        "value": 75,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "75",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 75.0,
            "expected": 75.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 913,
            "output_tokens": 5,
            "latency_ms": 1165.5,
            "usd": 0.002814,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "75",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 75.0,
            "expected": 75.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 913,
            "output_tokens": 5,
            "latency_ms": 1177.2,
            "usd": 0.002814,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "75",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 75.0,
            "expected": 75.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 913,
            "output_tokens": 5,
            "latency_ms": 964.7,
            "usd": 0.002814,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 913.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1102.467,
        "usd_sum": 0.008442
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-006",
      "category": "safety",
      "capability": "hot_holding",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "b3c9f3a848d65390",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. 
Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nWhat minimum temperature (°C) must hot food be held at for service? Respond with a single number.\n\nANSWER FORMAT: numeric:integer °C",
      "expected": {
        "value": 63,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "63",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 63.0,
            "expected": 63.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 899,
            "output_tokens": 5,
            "latency_ms": 760.7,
            "usd": 0.002772,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "63",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 63.0,
            "expected": 63.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 899,
            "output_tokens": 5,
            "latency_ms": 883.8,
            "usd": 0.002772,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "63",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 63.0,
            "expected": 63.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 899,
            "output_tokens": 5,
            "latency_ms": 802.4,
            "usd": 0.002772,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 899.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 815.633,
        "usd_sum": 0.008316
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-007",
      "category": "safety",
      "capability": "cold_chain",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "f120f4a38737e981",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. 
Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nWhat is the maximum temperature (°C) for frozen storage under OGS HACCP? Respond with a single number (negative).\n\nANSWER FORMAT: numeric:integer °C",
      "expected": {
        "value": -18,
        "tolerance": 1
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "-18",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": -18.0,
            "expected": -18.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 905,
            "output_tokens": 5,
            "latency_ms": 1387.5,
            "usd": 0.00279,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "-18",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": -18.0,
            "expected": -18.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 905,
            "output_tokens": 5,
            "latency_ms": 1044.7,
            "usd": 0.00279,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "-18",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": -18.0,
            "expected": -18.0,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 905,
            "output_tokens": 5,
            "latency_ms": 1265.6,
            "usd": 0.00279,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 905.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1232.6,
        "usd_sum": 0.00837
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-008",
      "category": "safety",
      "capability": "thawing_method",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "9b8f975bd042c7a0",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. 
Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nWhich of the following thawing methods is explicitly NOT permitted under OGS HACCP: 'in the fridge at 4 °C', 'under cold running water', 'in the microwave just before cooking', 'at room temperature on the counter'? Respond with the exact method phrase.\n\nINPUT:\n{\n  \"methods\": [\n    \"in the fridge at 4 °C\",\n    \"under cold running water\",\n    \"in the microwave just before cooking\",\n    \"at room temperature on the counter\"\n  ]\n}\n\nANSWER FORMAT: exact:method phrase",
      "expected": {
        "value": "at room temperature on the counter",
        "aliases": [
          "room temperature",
          "on the counter",
          "room-temperature thawing"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "at room temperature on the counter",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "at room temperature on the counter",
            "expected": "at room temperature on the counter",
            "aliases": [
              "room temperature",
              "on the counter",
              "room-temperature thawing"
            ]
          },
          "metrics": {
            "input_tokens": 1002,
            "output_tokens": 9,
            "latency_ms": 1358.0,
            "usd": 0.003141,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "at room temperature on the counter",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "at room temperature on the counter",
            "expected": "at room temperature on the counter",
            "aliases": [
              "room temperature",
              "on the counter",
              "room-temperature thawing"
            ]
          },
          "metrics": {
            "input_tokens": 1002,
            "output_tokens": 9,
            "latency_ms": 1153.3,
            "usd": 0.003141,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "at room temperature on the counter",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "at room temperature on the counter",
            "expected": "at room temperature on the counter",
            "aliases": [
              "room temperature",
              "on the counter",
              "room-temperature thawing"
            ]
          },
          "metrics": {
            "input_tokens": 1002,
            "output_tokens": 9,
            "latency_ms": 1864.2,
            "usd": 0.003141,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1002.0,
        "output_tokens_mean": 9.0,
        "latency_ms_mean": 1458.5,
        "usd_sum": 0.009423
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-009",
      "category": "safety",
      "capability": "ccp_identification",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "set_f1",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "9f4b49c1c20e4376",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. 
Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nFor a cook-chill dish of chicken curry with rice, identify the Critical Control Points from the OGS stage list. Return a comma-separated list using the OGS CCP stage IDs.\n\nINPUT:\n{\n  \"dish\": \"cook-chill chicken curry with rice\"\n}\n\nANSWER FORMAT: set:comma-separated CCP stage IDs",
      "expected": {
        "values": [
          "cooking",
          "cooling",
          "storage_cold",
          "reheating",
          "hot_holding"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "set:receiving,storage_cold,thawing,cooking,cooling,reheating,hot_holding,service",
          "compliant": true,
          "score": 0.7692,
          "detail": {
            "predicted": [
              "cooking",
              "cooling",
              "hot_holding",
              "reheating",
              "service",
              "set:receiving",
              "storage_cold",
              "thawing"
            ],
            "expected": [
              "cooking",
              "cooling",
              "hot_holding",
              "reheating",
              "storage_cold"
            ],
            "precision": 0.625,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 946,
            "output_tokens": 28,
            "latency_ms": 1586.6,
            "usd": 0.003258,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "set:receiving,storage_cold,thawing,cooking,cooling,reheating,hot_holding,service",
          "compliant": true,
          "score": 0.7692,
          "detail": {
            "predicted": [
              "cooking",
              "cooling",
              "hot_holding",
              "reheating",
              "service",
              "set:receiving",
              "storage_cold",
              "thawing"
            ],
            "expected": [
              "cooking",
              "cooling",
              "hot_holding",
              "reheating",
              "storage_cold"
            ],
            "precision": 0.625,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 946,
            "output_tokens": 28,
            "latency_ms": 2472.1,
            "usd": 0.003258,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "set:receiving,storage_cold,thawing,cooking,cooling,reheating,hot_holding,service",
          "compliant": true,
          "score": 0.7692,
          "detail": {
            "predicted": [
              "cooking",
              "cooling",
              "hot_holding",
              "reheating",
              "service",
              "set:receiving",
              "storage_cold",
              "thawing"
            ],
            "expected": [
              "cooking",
              "cooling",
              "hot_holding",
              "reheating",
              "storage_cold"
            ],
            "precision": 0.625,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 946,
            "output_tokens": 28,
            "latency_ms": 1655.3,
            "usd": 0.003258,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.7692,
      "score_summary": {
        "mean": 0.7692,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.7692,
        "ci95_hi": 0.7692,
        "n": 3
      },
      "score_if_valid": 0.7692,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 946.0,
        "output_tokens_mean": 28.0,
        "latency_ms_mean": 1904.667,
        "usd_sum": 0.009774
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-010",
      "category": "safety",
      "capability": "cooking_temperature",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "eea6814938f18565",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. 
Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nA bistro serves a rare whole-muscle ribeye (seared exterior, internal 46 °C). Under OGS HACCP, is this permitted? Answer yes or no.\n\nINPUT:\n{\n  \"product\": \"whole-muscle ribeye\",\n  \"core_c\": 46\n}\n\nANSWER FORMAT: exact:yes|no",
      "expected": {
        "value": "yes"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "yes",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "yes",
            "expected": "yes",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 944,
            "output_tokens": 4,
            "latency_ms": 1286.6,
            "usd": 0.002892,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "yes",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "yes",
            "expected": "yes",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 944,
            "output_tokens": 4,
            "latency_ms": 1469.7,
            "usd": 0.002892,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "yes",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "yes",
            "expected": "yes",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 944,
            "output_tokens": 4,
            "latency_ms": 1161.5,
            "usd": 0.002892,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 944.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1305.933,
        "usd_sum": 0.008676
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-011",
      "category": "safety",
      "capability": "at_risk_groups",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "edc2981cec11c39e",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. 
Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nA children's hospital must serve hollandaise sauce. Which is the ONE compliant option: 'fresh shell eggs at 63 °C', 'fresh shell eggs at 70 °C', 'pasteurised egg yolk', 'raw shell eggs with lemon juice'? Respond with the exact phrase.\n\nINPUT:\n{\n  \"options\": [\n    \"fresh shell eggs at 63 °C\",\n    \"fresh shell eggs at 70 °C\",\n    \"pasteurised egg yolk\",\n    \"raw shell eggs with lemon juice\"\n  ]\n}\n\nANSWER FORMAT: exact:option phrase",
      "expected": {
        "value": "pasteurised egg yolk",
        "aliases": [
          "pasteurised egg",
          "pasteurized egg yolk",
          "pasteurized egg"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "pasteurised egg yolk",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "pasteurised egg yolk",
            "expected": "pasteurised egg yolk",
            "aliases": [
              "pasteurised egg",
              "pasteurized egg yolk",
              "pasteurized egg"
            ]
          },
          "metrics": {
            "input_tokens": 1008,
            "output_tokens": 9,
            "latency_ms": 883.9,
            "usd": 0.003159,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "pasteurised egg yolk",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "pasteurised egg yolk",
            "expected": "pasteurised egg yolk",
            "aliases": [
              "pasteurised egg",
              "pasteurized egg yolk",
              "pasteurized egg"
            ]
          },
          "metrics": {
            "input_tokens": 1008,
            "output_tokens": 9,
            "latency_ms": 1416.8,
            "usd": 0.003159,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "pasteurised egg yolk",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "pasteurised egg yolk",
            "expected": "pasteurised egg yolk",
            "aliases": [
              "pasteurised egg",
              "pasteurized egg yolk",
              "pasteurized egg"
            ]
          },
          "metrics": {
            "input_tokens": 1008,
            "output_tokens": 9,
            "latency_ms": 855.5,
            "usd": 0.003159,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1008.0,
        "output_tokens_mean": 9.0,
        "latency_ms_mean": 1052.067,
        "usd_sum": 0.009477
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:safe-012",
      "category": "safety",
      "capability": "numeric_vector",
      "difficulty": "hard",
      "cuisine_region": "global",
      "scorer": "numeric_vector_mae",
      "context_needs": [
        "rules.haccp",
        "vocab.haccp_ccp_stages"
      ],
      "prompt_template_hash": "2ba4113ffe26b2fc",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### HACCP temperature and time rules (food-safety reference)\n\nTemperatures are expressed in degrees Celsius.\n\n**Danger zone**: 5 °C to 63 °C — limit total exposure to 4 hours.\n\n**Cold chain**:\n- Chilled storage: 0 to 5 °C\n- Frozen storage: ≤ -18 °C\n- Thawing: in fridge at ≤ 5 °C, or under cold running water, or in   microwave immediately before cooking. 
Never at room temperature.\n\n**Cooking core temperatures (minimum, hold for ≥15 s)**:\n- Poultry, reformed meat (burgers, sausages), stuffed meat: **≥ 75 °C**\n- Minced beef (burgers served medium+ from whole muscle after searing   all external surfaces to 75 °C is acceptable): **≥ 75 °C** as a   default; 70 °C for 2 min or 65 °C for 10 min are equivalent.\n- Whole-muscle beef/lamb (steak/roast) cooked to order: any internal   temperature acceptable if the external surface has been seared.\n- Pork: **≥ 63 °C** held 3 min (USDA) or 75 °C (EU retail default).\n- Fish (non-sushi-grade): **≥ 63 °C** held 15 s.\n- Shell eggs for at-risk groups: **≥ 70 °C** or use pasteurised egg.\n\n**Hot holding**: ≥ 63 °C.\n**Cooling**: 63 °C → 21 °C within 2 hours, then 21 °C → 5 °C within 4 more hours (UK FSA), or 57 °C → 21 °C within 2 hours and 21 °C → 5 °C within 4 hours (FDA).\n**Reheating**: reach **≥ 75 °C** (UK) / 74 °C (US) throughout, within 2 hours. Reheat once only.\n\n**CCP stages**: `receiving`, `storage_cold`, `storage_dry`, `thawing`, `prep`, `cooking`, `hot_holding`, `cooling`, `reheating`, `service`, `cleaning_sanitising`.\n\n### HACCP CCP stages\n\nreceiving, storage_cold, storage_dry, thawing, prep, cooking, hot_holding, cooling, reheating, service, cleaning_sanitising\n\n---\n\nReturn the canonical HACCP reference temperatures in °C for this dish flow (poultry cook-chill operation): [cold_storage_max, poultry_core_cook_min, hot_hold_min, reheat_min, frozen_storage_max]. Respond with a JSON array of five numbers only, in that order.\n\nINPUT:\n{\n  \"flow\": \"poultry cook-chill\"\n}\n\nANSWER FORMAT: json:array of 5 numbers",
      "expected": {
        "values": [
          5,
          75,
          63,
          75,
          -18
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[5, 75, 63, 75, -18]",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 5,
                "expected": 5,
                "score": 1.0
              },
              {
                "index": 1,
                "predicted": 75,
                "expected": 75,
                "score": 1.0
              },
              {
                "index": 2,
                "predicted": 63,
                "expected": 63,
                "score": 1.0
              },
              {
                "index": 3,
                "predicted": 75,
                "expected": 75,
                "score": 1.0
              },
              {
                "index": 4,
                "predicted": -18,
                "expected": -18,
                "score": 1.0
              }
            ],
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 974,
            "output_tokens": 18,
            "latency_ms": 944.9,
            "usd": 0.003192,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[5, 75, 63, 75, -18]",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 5,
                "expected": 5,
                "score": 1.0
              },
              {
                "index": 1,
                "predicted": 75,
                "expected": 75,
                "score": 1.0
              },
              {
                "index": 2,
                "predicted": 63,
                "expected": 63,
                "score": 1.0
              },
              {
                "index": 3,
                "predicted": 75,
                "expected": 75,
                "score": 1.0
              },
              {
                "index": 4,
                "predicted": -18,
                "expected": -18,
                "score": 1.0
              }
            ],
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 974,
            "output_tokens": 18,
            "latency_ms": 1121.7,
            "usd": 0.003192,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[5, 75, 63, 75, -18]",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_position": [
              {
                "index": 0,
                "predicted": 5,
                "expected": 5,
                "score": 1.0
              },
              {
                "index": 1,
                "predicted": 75,
                "expected": 75,
                "score": 1.0
              },
              {
                "index": 2,
                "predicted": 63,
                "expected": 63,
                "score": 1.0
              },
              {
                "index": 3,
                "predicted": 75,
                "expected": 75,
                "score": 1.0
              },
              {
                "index": 4,
                "predicted": -18,
                "expected": -18,
                "score": 1.0
              }
            ],
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 974,
            "output_tokens": 18,
            "latency_ms": 1169.5,
            "usd": 0.003192,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 974.0,
        "output_tokens_mean": 18.0,
        "latency_ms_mean": 1078.7,
        "usd_sum": 0.009576
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-001",
      "category": "sensory",
      "capability": "basic_taste_estimation",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "913118356da8150b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nEstimate the umami intensity of a 36-month aged Parmigiano-Reggiano on the OGS 0-10 intensity scale (0 = not present, 10 = extreme). Respond with only a single decimal number.\n\nINPUT:\n{\n  \"ingredient\": \"Parmigiano-Reggiano 36 months\",\n  \"attribute\": \"umami\"\n}\n\nANSWER FORMAT: integer_0_10",
      "expected": {
        "value": 8.5,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "9.5",
          "compliant": true,
          "score": 0.3333,
          "detail": {
            "predicted": 9.5,
            "expected": 8.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 437,
            "output_tokens": 7,
            "latency_ms": 1198.4,
            "usd": 0.001416,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "9.5",
          "compliant": true,
          "score": 0.3333,
          "detail": {
            "predicted": 9.5,
            "expected": 8.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 437,
            "output_tokens": 7,
            "latency_ms": 4205.9,
            "usd": 0.001416,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "9.5",
          "compliant": true,
          "score": 0.3333,
          "detail": {
            "predicted": 9.5,
            "expected": 8.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 437,
            "output_tokens": 7,
            "latency_ms": 834.3,
            "usd": 0.001416,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.3333,
      "score_summary": {
        "mean": 0.3333,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.3333,
        "ci95_hi": 0.3333,
        "n": 3
      },
      "score_if_valid": 0.3333,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 437.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 2079.533,
        "usd_sum": 0.004248
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-002",
      "category": "sensory",
      "capability": "basic_taste_estimation",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "2690028d89b60292",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nEstimate the sour intensity of undiluted fresh lemon juice on the OGS 0-10 intensity scale. Respond with only a single decimal number.\n\nINPUT:\n{\n  \"ingredient\": \"fresh lemon juice (undiluted)\",\n  \"attribute\": \"sour\"\n}\n\nANSWER FORMAT: integer_0_10",
      "expected": {
        "value": 9.5,
        "tolerance": 1.0
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "9",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": 9.0,
            "expected": 9.5,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 412,
            "output_tokens": 5,
            "latency_ms": 1335.6,
            "usd": 0.001311,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "9",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": 9.0,
            "expected": 9.5,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 412,
            "output_tokens": 5,
            "latency_ms": 1781.8,
            "usd": 0.001311,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "9",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": 9.0,
            "expected": 9.5,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 412,
            "output_tokens": 5,
            "latency_ms": 1616.2,
            "usd": 0.001311,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.5,
      "score_summary": {
        "mean": 0.5,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.5,
        "ci95_hi": 0.5,
        "n": 3
      },
      "score_if_valid": 0.5,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 412.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1577.867,
        "usd_sum": 0.003933
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-003",
      "category": "sensory",
      "capability": "structural_estimation",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.structural"
      ],
      "prompt_template_hash": "b09502eb4fd5d61f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n---\n\nEstimate the tannin intensity of a young (under 5 years old) Barolo on the OGS 0-10 intensity scale. Respond with only a single decimal number.\n\nINPUT:\n{\n  \"beverage\": \"young Barolo, <5 years\",\n  \"attribute\": \"tannin\"\n}\n\nANSWER FORMAT: integer_0_10",
      "expected": {
        "value": 8.5,
        "tolerance": 1.0
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "8",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": 8.0,
            "expected": 8.5,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 400,
            "output_tokens": 5,
            "latency_ms": 1170.9,
            "usd": 0.001275,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "8",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": 8.0,
            "expected": 8.5,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 400,
            "output_tokens": 5,
            "latency_ms": 1336.3,
            "usd": 0.001275,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "8",
          "compliant": true,
          "score": 0.5,
          "detail": {
            "predicted": 8.0,
            "expected": 8.5,
            "tolerance": 1.0
          },
          "metrics": {
            "input_tokens": 400,
            "output_tokens": 5,
            "latency_ms": 2088.8,
            "usd": 0.001275,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.5,
      "score_summary": {
        "mean": 0.5,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.5,
        "ci95_hi": 0.5,
        "n": 3
      },
      "score_if_valid": 0.5,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 400.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1532.0,
        "usd_sum": 0.003825
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-004",
      "category": "sensory",
      "capability": "basic_taste_vector",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_vector_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "94f009713056d52f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nReturn the five basic tastes for a ripe, in-season strawberry, each on the OGS 0-10 intensity scale. 
Respond as a single JSON object with keys sweet, sour, salty, bitter, umami.\n\nINPUT:\n{\n  \"ingredient\": \"ripe, in-season strawberry\"\n}\n\nANSWER FORMAT: json\n\nRESPONSE JSON SCHEMA:\n{\n  \"type\": \"object\",\n  \"required\": [\n    \"sweet\",\n    \"sour\",\n    \"salty\",\n    \"bitter\",\n    \"umami\"\n  ],\n  \"properties\": {\n    \"sweet\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"sour\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"salty\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"bitter\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"umami\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    }\n  },\n  \"additionalProperties\": false\n}",
      "expected": {
        "values": {
          "sweet": 7.0,
          "sour": 4.0,
          "salty": 0.5,
          "bitter": 1.0,
          "umami": 0.5
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\"sweet\": 7.5, \"sour\": 3.5, \"salty\": 0.5, \"bitter\": 0.5, \"umami\": 0.5}",
          "compliant": true,
          "score": 0.85,
          "detail": {
            "per_key": {
              "sweet": {
                "predicted": 7.5,
                "expected": 7.0,
                "score": 0.75
              },
              "sour": {
                "predicted": 3.5,
                "expected": 4.0,
                "score": 0.75
              },
              "salty": {
                "predicted": 0.5,
                "expected": 0.5,
                "score": 1.0
              },
              "bitter": {
                "predicted": 0.5,
                "expected": 1.0,
                "score": 0.75
              },
              "umami": {
                "predicted": 0.5,
                "expected": 0.5,
                "score": 1.0
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 652,
            "output_tokens": 46,
            "latency_ms": 1132.0,
            "usd": 0.002646,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\"sweet\": 7.5, \"sour\": 4.0, \"salty\": 0.5, \"bitter\": 0.5, \"umami\": 0.5}",
          "compliant": true,
          "score": 0.9,
          "detail": {
            "per_key": {
              "sweet": {
                "predicted": 7.5,
                "expected": 7.0,
                "score": 0.75
              },
              "sour": {
                "predicted": 4.0,
                "expected": 4.0,
                "score": 1.0
              },
              "salty": {
                "predicted": 0.5,
                "expected": 0.5,
                "score": 1.0
              },
              "bitter": {
                "predicted": 0.5,
                "expected": 1.0,
                "score": 0.75
              },
              "umami": {
                "predicted": 0.5,
                "expected": 0.5,
                "score": 1.0
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 652,
            "output_tokens": 46,
            "latency_ms": 1820.0,
            "usd": 0.002646,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\"sweet\": 7.5, \"sour\": 4.0, \"salty\": 0.5, \"bitter\": 0.5, \"umami\": 0.5}",
          "compliant": true,
          "score": 0.9,
          "detail": {
            "per_key": {
              "sweet": {
                "predicted": 7.5,
                "expected": 7.0,
                "score": 0.75
              },
              "sour": {
                "predicted": 4.0,
                "expected": 4.0,
                "score": 1.0
              },
              "salty": {
                "predicted": 0.5,
                "expected": 0.5,
                "score": 1.0
              },
              "bitter": {
                "predicted": 0.5,
                "expected": 1.0,
                "score": 0.75
              },
              "umami": {
                "predicted": 0.5,
                "expected": 0.5,
                "score": 1.0
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 652,
            "output_tokens": 46,
            "latency_ms": 1313.8,
            "usd": 0.002646,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.8833,
      "score_summary": {
        "mean": 0.8833,
        "stdev": 0.0289,
        "sem": 0.0167,
        "ci95_lo": 0.8507,
        "ci95_hi": 0.916,
        "n": 3
      },
      "score_if_valid": 0.8833,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 652.0,
        "output_tokens_mean": 46.0,
        "latency_ms_mean": 1421.933,
        "usd_sum": 0.007938
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-005",
      "category": "sensory",
      "capability": "structural_estimation",
      "difficulty": "medium",
      "cuisine_region": "american",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.structural"
      ],
      "prompt_template_hash": "c950ec9851ad52bf",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n---\n\nEstimate the body of a New World (California) oaked Chardonnay at 14% ABV on the OGS 0-10 body scale (0 = water-thin, 10 = syrupy). Respond with only a single decimal number.\n\nINPUT:\n{\n  \"beverage\": \"oaked California Chardonnay, 14% ABV\",\n  \"attribute\": \"body\"\n}\n\nANSWER FORMAT: integer_0_10",
      "expected": {
        "value": 7.0,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "8",
          "compliant": true,
          "score": 0.3333,
          "detail": {
            "predicted": 8.0,
            "expected": 7.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 421,
            "output_tokens": 5,
            "latency_ms": 1272.0,
            "usd": 0.001338,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "8",
          "compliant": true,
          "score": 0.3333,
          "detail": {
            "predicted": 8.0,
            "expected": 7.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 421,
            "output_tokens": 5,
            "latency_ms": 1069.2,
            "usd": 0.001338,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "8",
          "compliant": true,
          "score": 0.3333,
          "detail": {
            "predicted": 8.0,
            "expected": 7.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 421,
            "output_tokens": 5,
            "latency_ms": 1786.8,
            "usd": 0.001338,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.3333,
      "score_summary": {
        "mean": 0.3333,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.3333,
        "ci95_hi": 0.3333,
        "n": 3
      },
      "score_if_valid": 0.3333,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 421.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1376.0,
        "usd_sum": 0.004014
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-006",
      "category": "sensory",
      "capability": "dominant_aromatic_family",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "d68ef8562addd55d",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nIdentify the single dominant aromatic FAMILY (from the OGS vocabulary) of a classic Marlborough Sauvignon Blanc. Respond with just the family name.\n\nINPUT:\n{\n  \"beverage\": \"Marlborough Sauvignon Blanc\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "vegetal"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "fruit",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "fruit",
            "expected": "vegetal",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 386,
            "output_tokens": 4,
            "latency_ms": 1610.5,
            "usd": 0.001218,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "fruit",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "fruit",
            "expected": "vegetal",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 386,
            "output_tokens": 4,
            "latency_ms": 14094.6,
            "usd": 0.001218,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "fruit",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "fruit",
            "expected": "vegetal",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 386,
            "output_tokens": 4,
            "latency_ms": 1106.4,
            "usd": 0.001218,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 386.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 5603.833,
        "usd_sum": 0.003654
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-007",
      "category": "sensory",
      "capability": "structural_vector",
      "difficulty": "hard",
      "cuisine_region": "french",
      "scorer": "numeric_vector_mae",
      "context_needs": [
        "scales.structural"
      ],
      "prompt_template_hash": "658c14cce434cc0a",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n---\n\nReturn the OGS structural attributes of a Brut Champagne on the 0-10 scale. \nRespond as a JSON object with keys body, acidity_perception, tannin, astringency, alcohol_heat, effervescence.\n\nINPUT:\n{\n  \"beverage\": \"Brut non-vintage Champagne\"\n}\n\nANSWER FORMAT: json\n\nRESPONSE JSON SCHEMA:\n{\n  \"type\": \"object\",\n  \"required\": [\n    \"body\",\n    \"acidity_perception\",\n    \"tannin\",\n    \"astringency\",\n    \"alcohol_heat\",\n    \"effervescence\"\n  ],\n  \"properties\": {\n    \"body\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"acidity_perception\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"tannin\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"astringency\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"alcohol_heat\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"effervescence\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    }\n  },\n  \"additionalProperties\": false\n}",
      "expected": {
        "values": {
          "body": 4.5,
          "acidity_perception": 8.0,
          "tannin": 0.5,
          "astringency": 1.0,
          "alcohol_heat": 3.5,
          "effervescence": 9.0
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"body\": 4,\n  \"acidity_perception\": 7,\n  \"tannin\": 1,\n  \"astringency\": 1,\n  \"alcohol_heat\": 3,\n  \"effervescence\": 8\n}",
          "compliant": true,
          "score": 0.7083,
          "detail": {
            "per_key": {
              "body": {
                "predicted": 4,
                "expected": 4.5,
                "score": 0.75
              },
              "acidity_perception": {
                "predicted": 7,
                "expected": 8.0,
                "score": 0.5
              },
              "tannin": {
                "predicted": 1,
                "expected": 0.5,
                "score": 0.75
              },
              "astringency": {
                "predicted": 1,
                "expected": 1.0,
                "score": 1.0
              },
              "alcohol_heat": {
                "predicted": 3,
                "expected": 3.5,
                "score": 0.75
              },
              "effervescence": {
                "predicted": 8,
                "expected": 9.0,
                "score": 0.5
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 687,
            "output_tokens": 64,
            "latency_ms": 1676.7,
            "usd": 0.003021,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"body\": 4,\n  \"acidity_perception\": 7,\n  \"tannin\": 1,\n  \"astringency\": 1,\n  \"alcohol_heat\": 3,\n  \"effervescence\": 8\n}",
          "compliant": true,
          "score": 0.7083,
          "detail": {
            "per_key": {
              "body": {
                "predicted": 4,
                "expected": 4.5,
                "score": 0.75
              },
              "acidity_perception": {
                "predicted": 7,
                "expected": 8.0,
                "score": 0.5
              },
              "tannin": {
                "predicted": 1,
                "expected": 0.5,
                "score": 0.75
              },
              "astringency": {
                "predicted": 1,
                "expected": 1.0,
                "score": 1.0
              },
              "alcohol_heat": {
                "predicted": 3,
                "expected": 3.5,
                "score": 0.75
              },
              "effervescence": {
                "predicted": 8,
                "expected": 9.0,
                "score": 0.5
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 687,
            "output_tokens": 64,
            "latency_ms": 1440.0,
            "usd": 0.003021,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"body\": 4,\n  \"acidity_perception\": 7,\n  \"tannin\": 1,\n  \"astringency\": 1,\n  \"alcohol_heat\": 3,\n  \"effervescence\": 8\n}",
          "compliant": true,
          "score": 0.7083,
          "detail": {
            "per_key": {
              "body": {
                "predicted": 4,
                "expected": 4.5,
                "score": 0.75
              },
              "acidity_perception": {
                "predicted": 7,
                "expected": 8.0,
                "score": 0.5
              },
              "tannin": {
                "predicted": 1,
                "expected": 0.5,
                "score": 0.75
              },
              "astringency": {
                "predicted": 1,
                "expected": 1.0,
                "score": 1.0
              },
              "alcohol_heat": {
                "predicted": 3,
                "expected": 3.5,
                "score": 0.75
              },
              "effervescence": {
                "predicted": 8,
                "expected": 9.0,
                "score": 0.5
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 687,
            "output_tokens": 64,
            "latency_ms": 1267.8,
            "usd": 0.003021,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.7083,
      "score_summary": {
        "mean": 0.7083,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.7083,
        "ci95_hi": 0.7083,
        "n": 3
      },
      "score_if_valid": 0.7083,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 687.0,
        "output_tokens_mean": 64.0,
        "latency_ms_mean": 1461.5,
        "usd_sum": 0.009063
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-008",
      "category": "sensory",
      "capability": "basic_taste_estimation",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "873c6c2f9608e43f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nEstimate the bitter intensity of raw radicchio on the OGS 0-10 intensity scale. Respond with only a single decimal number.\n\nINPUT:\n{\n  \"ingredient\": \"raw radicchio\",\n  \"attribute\": \"bitter\"\n}\n\nANSWER FORMAT: integer_0_10",
      "expected": {
        "value": 6.5,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "7",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": 7.0,
            "expected": 6.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 403,
            "output_tokens": 5,
            "latency_ms": 1068.5,
            "usd": 0.001284,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "7",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": 7.0,
            "expected": 6.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 403,
            "output_tokens": 5,
            "latency_ms": 1467.5,
            "usd": 0.001284,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "7",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": 7.0,
            "expected": 6.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 403,
            "output_tokens": 5,
            "latency_ms": 1132.3,
            "usd": 0.001284,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.6667,
        "ci95_hi": 0.6667,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 403.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1222.767,
        "usd_sum": 0.003852
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-009",
      "category": "sensory",
      "capability": "basic_taste_estimation",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "523e04f39252cb4b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nEstimate the umami intensity of a plain 10-minute cold-steep kombu dashi on the OGS 0-10 scale. Respond with only a single decimal number.\n\nINPUT:\n{\n  \"ingredient\": \"kombu dashi (cold steep 10 min, 10 g/L Hokkaido kombu)\",\n  \"attribute\": \"umami\"\n}\n\nANSWER FORMAT: decimal_0_10",
      "expected": {
        "value": 6.5,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "6.5",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 6.5,
            "expected": 6.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 433,
            "output_tokens": 7,
            "latency_ms": 1047.6,
            "usd": 0.001404,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "6.5",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 6.5,
            "expected": 6.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 433,
            "output_tokens": 7,
            "latency_ms": 785.7,
            "usd": 0.001404,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "6.5",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 6.5,
            "expected": 6.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 433,
            "output_tokens": 7,
            "latency_ms": 983.5,
            "usd": 0.001404,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 433.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 938.933,
        "usd_sum": 0.004212
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-010",
      "category": "sensory",
      "capability": "basic_taste_estimation",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "1813cae53ba17a84",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nEstimate the bitter intensity of a square of unsweetened 85% cacao dark chocolate on the OGS 0-10 scale.\n\nINPUT:\n{\n  \"ingredient\": \"single-origin 85% dark chocolate\",\n  \"attribute\": \"bitter\"\n}\n\nANSWER FORMAT: decimal_0_10",
      "expected": {
        "value": 6.5,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "8.5",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 8.5,
            "expected": 6.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 406,
            "output_tokens": 7,
            "latency_ms": 1458.0,
            "usd": 0.001323,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "8.5",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 8.5,
            "expected": 6.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 406,
            "output_tokens": 7,
            "latency_ms": 1077.9,
            "usd": 0.001323,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "8.5",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": 8.5,
            "expected": 6.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 406,
            "output_tokens": 7,
            "latency_ms": 4648.1,
            "usd": 0.001323,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 406.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 2394.667,
        "usd_sum": 0.003969
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-011",
      "category": "sensory",
      "capability": "structural_estimation",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.structural"
      ],
      "prompt_template_hash": "1886b2e009716d1d",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n---\n\nEstimate the perceived acidity of freshly squeezed yuzu juice on the OGS 0-10 scale.\n\nINPUT:\n{\n  \"ingredient\": \"fresh yuzu juice\",\n  \"attribute\": \"acidity_perception\"\n}\n\nANSWER FORMAT: decimal_0_10",
      "expected": {
        "value": 9.0,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "8.5",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": 8.5,
            "expected": 9.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 379,
            "output_tokens": 7,
            "latency_ms": 1977.0,
            "usd": 0.001242,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "8.5",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": 8.5,
            "expected": 9.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 379,
            "output_tokens": 7,
            "latency_ms": 3189.9,
            "usd": 0.001242,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "8.5",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": 8.5,
            "expected": 9.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 379,
            "output_tokens": 7,
            "latency_ms": 1387.9,
            "usd": 0.001242,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.6667,
        "ci95_hi": 0.6667,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 379.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 2184.933,
        "usd_sum": 0.003726
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-012",
      "category": "sensory",
      "capability": "basic_taste_vector",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "numeric_vector_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "8d40a83f1b501685",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nReturn the five basic tastes of fresh-squeezed Valencia orange juice on the OGS 0-10 scale as a JSON object with keys sweet, sour, salty, bitter, umami. Respond with ONLY the JSON.\n\nINPUT:\n{\n  \"ingredient\": \"freshly squeezed Valencia orange juice\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "values": {
          "sweet": 7.0,
          "sour": 5.5,
          "salty": 0.0,
          "bitter": 0.5,
          "umami": 0.0
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\"sweet\": 6.5, \"sour\": 5.5, \"salty\": 0.5, \"bitter\": 1.5, \"umami\": 0.5}",
          "compliant": true,
          "score": 0.75,
          "detail": {
            "per_key": {
              "sweet": {
                "predicted": 6.5,
                "expected": 7.0,
                "score": 0.75
              },
              "sour": {
                "predicted": 5.5,
                "expected": 5.5,
                "score": 1.0
              },
              "salty": {
                "predicted": 0.5,
                "expected": 0.0,
                "score": 0.75
              },
              "bitter": {
                "predicted": 1.5,
                "expected": 0.5,
                "score": 0.5
              },
              "umami": {
                "predicted": 0.5,
                "expected": 0.0,
                "score": 0.75
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 416,
            "output_tokens": 46,
            "latency_ms": 1494.6,
            "usd": 0.001938,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\"sweet\": 6.5, \"sour\": 5.5, \"salty\": 0.5, \"bitter\": 1.5, \"umami\": 0.5}",
          "compliant": true,
          "score": 0.75,
          "detail": {
            "per_key": {
              "sweet": {
                "predicted": 6.5,
                "expected": 7.0,
                "score": 0.75
              },
              "sour": {
                "predicted": 5.5,
                "expected": 5.5,
                "score": 1.0
              },
              "salty": {
                "predicted": 0.5,
                "expected": 0.0,
                "score": 0.75
              },
              "bitter": {
                "predicted": 1.5,
                "expected": 0.5,
                "score": 0.5
              },
              "umami": {
                "predicted": 0.5,
                "expected": 0.0,
                "score": 0.75
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 416,
            "output_tokens": 46,
            "latency_ms": 1285.6,
            "usd": 0.001938,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\"sweet\": 6.5, \"sour\": 5.5, \"salty\": 0.5, \"bitter\": 1.5, \"umami\": 0.5}",
          "compliant": true,
          "score": 0.75,
          "detail": {
            "per_key": {
              "sweet": {
                "predicted": 6.5,
                "expected": 7.0,
                "score": 0.75
              },
              "sour": {
                "predicted": 5.5,
                "expected": 5.5,
                "score": 1.0
              },
              "salty": {
                "predicted": 0.5,
                "expected": 0.0,
                "score": 0.75
              },
              "bitter": {
                "predicted": 1.5,
                "expected": 0.5,
                "score": 0.5
              },
              "umami": {
                "predicted": 0.5,
                "expected": 0.0,
                "score": 0.75
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 416,
            "output_tokens": 46,
            "latency_ms": 7043.5,
            "usd": 0.001938,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.75,
      "score_summary": {
        "mean": 0.75,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.75,
        "ci95_hi": 0.75,
        "n": 3
      },
      "score_if_valid": 0.75,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 416.0,
        "output_tokens_mean": 46.0,
        "latency_ms_mean": 3274.567,
        "usd_sum": 0.005814
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-013",
      "category": "sensory",
      "capability": "basic_taste_estimation",
      "difficulty": "medium",
      "cuisine_region": "middle_eastern",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "691f59088f3e98f3",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nEstimate the sweet intensity of a fresh Medjool date on the OGS 0-10 scale.\n\nINPUT:\n{\n  \"ingredient\": \"Medjool date, soft and ripe\",\n  \"attribute\": \"sweet\"\n}\n\nANSWER FORMAT: decimal_0_10",
      "expected": {
        "value": 9.0,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "9.2",
          "compliant": true,
          "score": 0.8667,
          "detail": {
            "predicted": 9.2,
            "expected": 9.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 402,
            "output_tokens": 7,
            "latency_ms": 2193.9,
            "usd": 0.001311,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "9.2",
          "compliant": true,
          "score": 0.8667,
          "detail": {
            "predicted": 9.2,
            "expected": 9.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 402,
            "output_tokens": 7,
            "latency_ms": 1268.6,
            "usd": 0.001311,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "9.2",
          "compliant": true,
          "score": 0.8667,
          "detail": {
            "predicted": 9.2,
            "expected": 9.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 402,
            "output_tokens": 7,
            "latency_ms": 1175.3,
            "usd": 0.001311,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.8667,
      "score_summary": {
        "mean": 0.8667,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.8667,
        "ci95_hi": 0.8667,
        "n": 3
      },
      "score_if_valid": 0.8667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 402.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 1545.933,
        "usd_sum": 0.003933
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-014",
      "category": "sensory",
      "capability": "dominant_aromatic_family",
      "difficulty": "medium",
      "cuisine_region": "scottish",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "c4a83a79265c0e9f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nIdentify the single dominant aromatic FAMILY (OGS vocabulary) of a heavily-peated Islay single malt (e.g. Ardbeg 10).\n\nINPUT:\n{\n  \"item\": \"heavily peated Islay single malt whisky, ~50 ppm phenols\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "wood"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "earth",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "earth",
            "expected": "wood",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 396,
            "output_tokens": 4,
            "latency_ms": 1371.6,
            "usd": 0.001248,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "earth",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "earth",
            "expected": "wood",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 396,
            "output_tokens": 4,
            "latency_ms": 1121.3,
            "usd": 0.001248,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "earth",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "earth",
            "expected": "wood",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 396,
            "output_tokens": 4,
            "latency_ms": 1122.2,
            "usd": 0.001248,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 396.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1205.033,
        "usd_sum": 0.003744
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-015",
      "category": "sensory",
      "capability": "structural_vector",
      "difficulty": "hard",
      "cuisine_region": "italian",
      "scorer": "numeric_vector_mae",
      "context_needs": [
        "scales.structural"
      ],
      "prompt_template_hash": "66d64ca2254c09ee",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n---\n\nReturn the OGS structural attributes of a 5-year-old Amarone della Valpolicella on the 0-10 scale. Respond with a JSON object with keys body, acidity_perception, tannin, astringency, alcohol_heat, effervescence. JSON only.\n\nINPUT:\n{\n  \"item\": \"Amarone della Valpolicella Classico, 5 years old, 16% abv\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "values": {
          "body": 9.0,
          "acidity_perception": 6.0,
          "tannin": 7.0,
          "astringency": 5.5,
          "alcohol_heat": 8.5,
          "effervescence": 0.0
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\"body\": 9, \"acidity_perception\": 6, \"tannin\": 8, \"astringency\": 7, \"alcohol_heat\": 8, \"effervescence\": 0}",
          "compliant": true,
          "score": 0.75,
          "detail": {
            "per_key": {
              "body": {
                "predicted": 9,
                "expected": 9.0,
                "score": 1.0
              },
              "acidity_perception": {
                "predicted": 6,
                "expected": 6.0,
                "score": 1.0
              },
              "tannin": {
                "predicted": 8,
                "expected": 7.0,
                "score": 0.5
              },
              "astringency": {
                "predicted": 7,
                "expected": 5.5,
                "score": 0.25
              },
              "alcohol_heat": {
                "predicted": 8,
                "expected": 8.5,
                "score": 0.75
              },
              "effervescence": {
                "predicted": 0,
                "expected": 0.0,
                "score": 1.0
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 424,
            "output_tokens": 50,
            "latency_ms": 1267.6,
            "usd": 0.002022,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\"body\": 9, \"acidity_perception\": 6, \"tannin\": 8, \"astringency\": 7, \"alcohol_heat\": 8, \"effervescence\": 0}",
          "compliant": true,
          "score": 0.75,
          "detail": {
            "per_key": {
              "body": {
                "predicted": 9,
                "expected": 9.0,
                "score": 1.0
              },
              "acidity_perception": {
                "predicted": 6,
                "expected": 6.0,
                "score": 1.0
              },
              "tannin": {
                "predicted": 8,
                "expected": 7.0,
                "score": 0.5
              },
              "astringency": {
                "predicted": 7,
                "expected": 5.5,
                "score": 0.25
              },
              "alcohol_heat": {
                "predicted": 8,
                "expected": 8.5,
                "score": 0.75
              },
              "effervescence": {
                "predicted": 0,
                "expected": 0.0,
                "score": 1.0
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 424,
            "output_tokens": 50,
            "latency_ms": 1509.2,
            "usd": 0.002022,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\"body\": 9, \"acidity_perception\": 6, \"tannin\": 8, \"astringency\": 7, \"alcohol_heat\": 8, \"effervescence\": 0}",
          "compliant": true,
          "score": 0.75,
          "detail": {
            "per_key": {
              "body": {
                "predicted": 9,
                "expected": 9.0,
                "score": 1.0
              },
              "acidity_perception": {
                "predicted": 6,
                "expected": 6.0,
                "score": 1.0
              },
              "tannin": {
                "predicted": 8,
                "expected": 7.0,
                "score": 0.5
              },
              "astringency": {
                "predicted": 7,
                "expected": 5.5,
                "score": 0.25
              },
              "alcohol_heat": {
                "predicted": 8,
                "expected": 8.5,
                "score": 0.75
              },
              "effervescence": {
                "predicted": 0,
                "expected": 0.0,
                "score": 1.0
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 424,
            "output_tokens": 50,
            "latency_ms": 1390.7,
            "usd": 0.002022,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.75,
      "score_summary": {
        "mean": 0.75,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.75,
        "ci95_hi": 0.75,
        "n": 3
      },
      "score_if_valid": 0.75,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 424.0,
        "output_tokens_mean": 50.0,
        "latency_ms_mean": 1389.167,
        "usd_sum": 0.006066
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-016",
      "category": "sensory",
      "capability": "basic_taste_estimation",
      "difficulty": "medium",
      "cuisine_region": "spanish",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "70318c6fd3b033d2",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nEstimate the umami intensity of 24-month acorn-finished jamón ibérico de bellota on the OGS 0-10 scale.\n\nINPUT:\n{\n  \"ingredient\": \"jamón ibérico de bellota, 24 months, acorn-finished\",\n  \"attribute\": \"umami\"\n}\n\nANSWER FORMAT: decimal_0_10",
      "expected": {
        "value": 8.0,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "9.2",
          "compliant": true,
          "score": 0.2,
          "detail": {
            "predicted": 9.2,
            "expected": 8.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 421,
            "output_tokens": 7,
            "latency_ms": 1336.3,
            "usd": 0.001368,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "9.2",
          "compliant": true,
          "score": 0.2,
          "detail": {
            "predicted": 9.2,
            "expected": 8.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 421,
            "output_tokens": 7,
            "latency_ms": 1276.0,
            "usd": 0.001368,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "9.2",
          "compliant": true,
          "score": 0.2,
          "detail": {
            "predicted": 9.2,
            "expected": 8.0,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 421,
            "output_tokens": 7,
            "latency_ms": 1194.7,
            "usd": 0.001368,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.2,
      "score_summary": {
        "mean": 0.2,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.2,
        "ci95_hi": 0.2,
        "n": 3
      },
      "score_if_valid": 0.2,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 421.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 1269.0,
        "usd_sum": 0.004104
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-017",
      "category": "sensory",
      "capability": "dominant_aromatic_family",
      "difficulty": "medium",
      "cuisine_region": "thai",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "c3cedde89152ce76",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nIdentify the single dominant aromatic FAMILY of fresh Thai basil (horapa) leaves.\n\nINPUT:\n{\n  \"item\": \"fresh Thai basil (horapa), whole leaves\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "herbal"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "herbal",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "herbal",
            "expected": "herbal",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 368,
            "output_tokens": 5,
            "latency_ms": 2518.4,
            "usd": 0.001179,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "herbal",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "herbal",
            "expected": "herbal",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 368,
            "output_tokens": 5,
            "latency_ms": 960.3,
            "usd": 0.001179,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "herbal",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "herbal",
            "expected": "herbal",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 368,
            "output_tokens": 5,
            "latency_ms": 1100.4,
            "usd": 0.001179,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 368.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1526.367,
        "usd_sum": 0.003537
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-018",
      "category": "sensory",
      "capability": "structural_estimation",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.structural"
      ],
      "prompt_template_hash": "7d31de11173a7fd2",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n---\n\nEstimate the body of a dry Provençal rosé (e.g. Côtes de Provence Grenache-led) on the OGS 0-10 scale.\n\nINPUT:\n{\n  \"item\": \"dry Provençal rosé, 12.5% abv, minimal skin contact\",\n  \"attribute\": \"body\"\n}\n\nANSWER FORMAT: decimal_0_10",
      "expected": {
        "value": 3.5,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "3.5",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 3.5,
            "expected": 3.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 406,
            "output_tokens": 7,
            "latency_ms": 1042.7,
            "usd": 0.001323,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "3.5",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 3.5,
            "expected": 3.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 406,
            "output_tokens": 7,
            "latency_ms": 988.2,
            "usd": 0.001323,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "3.5",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 3.5,
            "expected": 3.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 406,
            "output_tokens": 7,
            "latency_ms": 1139.0,
            "usd": 0.001323,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 406.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 1056.633,
        "usd_sum": 0.003969
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-019",
      "category": "sensory",
      "capability": "basic_taste_vector",
      "difficulty": "hard",
      "cuisine_region": "korean",
      "scorer": "numeric_vector_mae",
      "context_needs": [
        "scales.basic_tastes"
      ],
      "prompt_template_hash": "2e7df899a0148ebd",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n---\n\nReturn the five basic tastes of well-fermented napa kimchi (6 weeks at 4°C after 2 days at room temp) as a JSON object.\n\nINPUT:\n{\n  \"ingredient\": \"napa kimchi, 6 weeks fermented, ripe\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "values": {
          "sweet": 1.0,
          "sour": 7.0,
          "salty": 6.0,
          "bitter": 1.5,
          "umami": 7.0
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"sweet\": 1.5,\n  \"sour\": 7.5,\n  \"salty\": 6.5,\n  \"bitter\": 1.0,\n  \"umami\": 6.0\n}",
          "compliant": true,
          "score": 0.7,
          "detail": {
            "per_key": {
              "sweet": {
                "predicted": 1.5,
                "expected": 1.0,
                "score": 0.75
              },
              "sour": {
                "predicted": 7.5,
                "expected": 7.0,
                "score": 0.75
              },
              "salty": {
                "predicted": 6.5,
                "expected": 6.0,
                "score": 0.75
              },
              "bitter": {
                "predicted": 1.0,
                "expected": 1.5,
                "score": 0.75
              },
              "umami": {
                "predicted": 6.0,
                "expected": 7.0,
                "score": 0.5
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 407,
            "output_tokens": 58,
            "latency_ms": 2756.1,
            "usd": 0.002091,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"sweet\": 1.5,\n  \"sour\": 7.5,\n  \"salty\": 6.5,\n  \"bitter\": 1.0,\n  \"umami\": 6.0\n}",
          "compliant": true,
          "score": 0.7,
          "detail": {
            "per_key": {
              "sweet": {
                "predicted": 1.5,
                "expected": 1.0,
                "score": 0.75
              },
              "sour": {
                "predicted": 7.5,
                "expected": 7.0,
                "score": 0.75
              },
              "salty": {
                "predicted": 6.5,
                "expected": 6.0,
                "score": 0.75
              },
              "bitter": {
                "predicted": 1.0,
                "expected": 1.5,
                "score": 0.75
              },
              "umami": {
                "predicted": 6.0,
                "expected": 7.0,
                "score": 0.5
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 407,
            "output_tokens": 58,
            "latency_ms": 1573.0,
            "usd": 0.002091,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"sweet\": 1.5,\n  \"sour\": 7.5,\n  \"salty\": 6.5,\n  \"bitter\": 1.0,\n  \"umami\": 6.0\n}",
          "compliant": true,
          "score": 0.7,
          "detail": {
            "per_key": {
              "sweet": {
                "predicted": 1.5,
                "expected": 1.0,
                "score": 0.75
              },
              "sour": {
                "predicted": 7.5,
                "expected": 7.0,
                "score": 0.75
              },
              "salty": {
                "predicted": 6.5,
                "expected": 6.0,
                "score": 0.75
              },
              "bitter": {
                "predicted": 1.0,
                "expected": 1.5,
                "score": 0.75
              },
              "umami": {
                "predicted": 6.0,
                "expected": 7.0,
                "score": 0.5
              }
            },
            "tolerance": 2.0
          },
          "metrics": {
            "input_tokens": 407,
            "output_tokens": 58,
            "latency_ms": 1652.9,
            "usd": 0.002091,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.7,
      "score_summary": {
        "mean": 0.7,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.7,
        "ci95_hi": 0.7,
        "n": 3
      },
      "score_if_valid": 0.7,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 407.0,
        "output_tokens_mean": 58.0,
        "latency_ms_mean": 1994.0,
        "usd_sum": 0.006273
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:sense-020",
      "category": "sensory",
      "capability": "structural_estimation",
      "difficulty": "medium",
      "cuisine_region": "portuguese",
      "scorer": "numeric_mae",
      "context_needs": [
        "scales.structural"
      ],
      "prompt_template_hash": "ca5b9ab85fe71acb",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n---\n\nEstimate the effervescence of a typical young Vinho Verde (traditional style with slight spritz) on the OGS 0-10 scale.\n\nINPUT:\n{\n  \"item\": \"young Vinho Verde, slightly spritzig\",\n  \"attribute\": \"effervescence\"\n}\n\nANSWER FORMAT: decimal_0_10",
      "expected": {
        "value": 2.5,
        "tolerance": 1.5
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "2.5",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 2.5,
            "expected": 2.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 391,
            "output_tokens": 7,
            "latency_ms": 2237.8,
            "usd": 0.001278,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "2.5",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 2.5,
            "expected": 2.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 391,
            "output_tokens": 7,
            "latency_ms": 1261.6,
            "usd": 0.001278,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "2.5",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": 2.5,
            "expected": 2.5,
            "tolerance": 1.5
          },
          "metrics": {
            "input_tokens": 391,
            "output_tokens": 7,
            "latency_ms": 1080.9,
            "usd": 0.001278,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 391.0,
        "output_tokens_mean": 7.0,
        "latency_ms_mean": 1526.767,
        "usd_sum": 0.003834
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-001",
      "category": "structured_output",
      "capability": "schema_valid_ingredient",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.ingredient",
        "vocab.ingredient_categories",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "4523d68b7da194f7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS ingredient schema (`schema/ingredient.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/ingredient.schema.json\",\n  \"title\": \"OGS Ingredient\",\n  \"description\": \"An ingredient entity with sensory profile, category, and provenance metadata.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"category\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$\",\n      \"description\": \"OGS identifier in the format ogs:<namespace>:ingredient:<id>\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\",\n      \"description\": \"OGS specification version (0.1.x or 0.2.x)\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\",\n      \"description\": \"Authoritative / endonym language of this entity's human-readable fields.\"\n    
},\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      },\n      \"description\": \"Advisory list of languages covered by this entity's LString fields.\"\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"category\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"protein\",\n        \"vegetable\",\n        \"fruit\",\n        \"grain\",\n        \"dairy\",\n        \"fat\",\n        \"herb\",\n        \"spice\",\n        \"fungus\",\n        \"legume\",\n        \"nut\",\n        \"seed\",\n        \"sweetener\",\n        \"acid\",\n        \"salt\",\n        \"other\"\n      ]\n    },\n    \"subcategory\": {\n      \"type\": \"string\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"origin\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"region\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"season\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"terroir_notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          
\"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        
\"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    
\"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Ingredient categories\n\nprotein, vegetable, fruit, grain, dairy, fat, herb, spice, fungus, legume, nut, seed, sweetener, acid, salt, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic 
tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document that describes 'fleur de sel' as an OGS ingredient. It MUST validate against the OGS ingredient schema (schema/ingredient.schema.json). Respond with ONLY the JSON object, no markdown fences.\n\nINPUT:\n{\n  \"ingredient_name\": \"Fleur de sel\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"fleur-de-sel\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/ingredient.schema.json",
        "required_fields": {
          "id": "ogs:core:ingredient:fleur-de-sel",
          "ogs_version": "0.1.0",
          "category": "salt"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:fleur-de-sel\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"fr\",\n  \"available_languages\": [\"fr\", \"en\"],\n  \"name\": {\n    \"fr\": \"Fleur de sel\",\n    \"en\": \"Fleur de sel\"\n  },\n  \"description\": {\n    \"fr\": \"Sel marin récolté à la main à la surface des marais salants, reconnu pour sa texture délicate et son goût subtil.\",\n    \"en\": \"Hand-harvested sea salt skimmed from the surface of salt marshes, prized for its delicate texture and subtle flavour.\"\n  },\n  \"category\": \"salt\",\n  \"subcategory\": \"finishing salt\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 0,\n      \"sour\": 0,\n      \"salty\": 8.5,\n      \"bitter\": 0.5,\n      \"umami\": 1.0\n    },\n    \"structural\": {\n      \"body\": 2.0,\n      \"acidity_perception\": 0,\n      \"tannin\": 0,\n      \"astringency\": 0.5,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"marine\",\n        \"group\": \"sea minerals\",\n        \"item\": \"ocean brine\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 5.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"mineral stone\",\n        \"intensity\": 3.0\n      }\n    ],\n    \"texture\": [\"flaky\", \"crunchy\", \"delicate\", \"moist\"],\n    \"finish_length\": 4.0,\n    \"complexity\": 3.5\n  },\n  \"origin\": {\n    \"region\": {\n      \"fr\": \"Guérande, Bretagne, France\",\n      \"en\": \"Guérande, Brittany, France\"\n    },\n    \"season\": {\n      \"fr\": \"Été\",\n      \"en\": \"Summer\"\n    },\n    \"terroir_notes\": {\n      \"fr\": \"Récolté dans les marais salants de Guérande, avec une teneur élevée en magnésium et oligoéléments.\",\n      \"en\": \"Harvested from the salt marshes of Guérande, with high magnesium content and trace minerals.\"\n    }\n  },\n  \"notes\": {\n    
\"fr\": \"Utilisé principalement comme sel de finition pour les plats sucrés et salés.\",\n    \"en\": \"Used primarily as a finishing salt for both sweet and savoury dishes.\"\n  },\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"updated\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"OGS Core Vocabulary\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:fleur-de-sel",
                "expected": "ogs:core:ingredient:fleur-de-sel",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "salt",
                "expected": "salt",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3081,
            "output_tokens": 778,
            "latency_ms": 9582.7,
            "usd": 0.020913,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:fleur-de-sel\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"fr\",\n  \"available_languages\": [\"fr\", \"en\"],\n  \"name\": {\n    \"fr\": \"Fleur de sel\",\n    \"en\": \"Fleur de sel\"\n  },\n  \"description\": {\n    \"fr\": \"Sel marin de haute qualité récolté à la main à la surface des marais salants.\",\n    \"en\": \"High-quality sea salt hand-harvested from the surface of salt evaporation ponds.\"\n  },\n  \"category\": \"salt\",\n  \"subcategory\": \"sea salt\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"salty\": 8.5,\n      \"sweet\": 0.5,\n      \"bitter\": 0.5,\n      \"umami\": 1.0,\n      \"sour\": 0.0\n    },\n    \"structural\": {\n      \"body\": 3.0,\n      \"acidity_perception\": 0.5,\n      \"astringency\": 0.5\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"marine\",\n        \"group\": \"oceanic\",\n        \"item\": \"sea breeze\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"mineral\",\n        \"intensity\": 3.5\n      }\n    ],\n    \"texture\": [\"flaky\", \"crunchy\", \"delicate\", \"crystalline\"],\n    \"finish_length\": 5.0,\n    \"complexity\": 4.0\n  },\n  \"origin\": {\n    \"region\": {\n      \"fr\": \"Guérande, Bretagne, France\",\n      \"en\": \"Guérande, Brittany, France\"\n    },\n    \"terroir_notes\": {\n      \"fr\": \"Récolté dans les marais salants de Guérande, avec une teneur élevée en minéraux et une texture délicate.\",\n      \"en\": \"Harvested from the salt marshes of Guérande, with high mineral content and a delicate texture.\"\n    }\n  },\n  \"notes\": {\n    \"fr\": \"Fleur de sel est souvent utilisée comme sel de finition pour rehausser les plats.\",\n    \"en\": \"Fleur de sel is often used as a finishing salt to enhance dishes.\"\n  },\n  \"metadata\": {\n    \"created\": 
\"2024-01-01T00:00:00Z\",\n    \"updated\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"OGS Core Ingredient Library\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:fleur-de-sel",
                "expected": "ogs:core:ingredient:fleur-de-sel",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "salt",
                "expected": "salt",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3081,
            "output_tokens": 711,
            "latency_ms": 8477.3,
            "usd": 0.019908,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:fleur-de-sel\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"fr\",\n  \"available_languages\": [\"fr\", \"en\"],\n  \"name\": {\n    \"fr\": \"Fleur de sel\",\n    \"en\": \"Fleur de sel\"\n  },\n  \"description\": {\n    \"fr\": \"Sel marin de haute qualité récolté à la main à la surface des marais salants.\",\n    \"en\": \"High-quality sea salt hand-harvested from the surface of salt marshes.\"\n  },\n  \"category\": \"salt\",\n  \"subcategory\": \"sea salt\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"salty\": 8.5,\n      \"sweet\": 0.5,\n      \"sour\": 0,\n      \"bitter\": 0.5,\n      \"umami\": 1.0\n    },\n    \"structural\": {\n      \"body\": 3.0,\n      \"acidity_perception\": 0.5,\n      \"tannin\": 0,\n      \"astringency\": 0.5,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"marine\",\n        \"group\": \"sea\",\n        \"item\": \"ocean brine\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"mineral\",\n        \"intensity\": 4.0\n      }\n    ],\n    \"texture\": [\"crunchy\", \"flaky\", \"delicate\"],\n    \"finish_length\": 5.0,\n    \"complexity\": 4.0\n  },\n  \"origin\": {\n    \"region\": {\n      \"fr\": \"Guérande, Bretagne, France\",\n      \"en\": \"Guérande, Brittany, France\"\n    },\n    \"season\": {\n      \"fr\": \"Été\",\n      \"en\": \"Summer\"\n    },\n    \"terroir_notes\": {\n      \"fr\": \"Récoltée dans les marais salants de Guérande, cette fleur de sel est renommée pour sa pureté et ses notes marines délicates.\",\n      \"en\": \"Harvested from the salt marshes of Guérande, this fleur de sel is renowned for its purity and delicate marine notes.\"\n    }\n  },\n  \"notes\": {\n    \"fr\": \"Utilisée comme sel de finition pour rehausser les 
plats sans les cuire.\",\n    \"en\": \"Used as a finishing salt to enhance dishes without cooking it.\"\n  },\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"updated\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"OGS Core Vocabulary\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:fleur-de-sel",
                "expected": "ogs:core:ingredient:fleur-de-sel",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "salt",
                "expected": "salt",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3081,
            "output_tokens": 761,
            "latency_ms": 9215.4,
            "usd": 0.020658,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 3081.0,
        "output_tokens_mean": 750.0,
        "latency_ms_mean": 9091.8,
        "usd_sum": 0.061479
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-002",
      "category": "structured_output",
      "capability": "schema_valid_beverage",
      "difficulty": "hard",
      "cuisine_region": "french",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.beverage",
        "vocab.beverage_types",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "367b58556573440d",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS beverage schema (`schema/beverage.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/beverage.schema.json\",\n  \"title\": \"OGS Beverage\",\n  \"description\": \"A beverage entity with type, structural attributes, sensory profile, and provenance.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"type\",\n    \"sensory_profile\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:beverage:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": 
\"#/$defs/lstring\"\n    },\n    \"type\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"wine\",\n        \"beer\",\n        \"spirit\",\n        \"cider\",\n        \"sake\",\n        \"other\"\n      ]\n    },\n    \"subtype\": {\n      \"type\": \"string\"\n    },\n    \"grape_varieties\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"type\": \"string\"\n      }\n    },\n    \"vintage\": {\n      \"type\": \"integer\"\n    },\n    \"region\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"appellation\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"producer\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"abv\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 100\n    },\n    \"residual_sugar\": {\n      \"type\": \"number\",\n      \"minimum\": 0\n    },\n    \"serve_temperature_c\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"min\": {\n          \"type\": \"number\"\n        },\n        \"max\": {\n          \"type\": \"number\"\n        }\n      },\n      \"additionalProperties\": false,\n      \"description\": \"Recommended serving temperature range in degrees Celsius (SI canonical).\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n  
        \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        
\"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    
\"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Beverage types\n\nwine, beer, spirit, cider, sake, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is 
rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document describing a Sancerre (Sauvignon Blanc, Loire Valley) that validates against the OGS beverage schema. Respond with ONLY the JSON object, no markdown fences.\n\nINPUT:\n{\n  \"beverage_name\": \"Sancerre\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"sancerre-generic\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/beverage.schema.json",
        "required_fields": {
          "id": "ogs:core:beverage:sancerre-generic",
          "ogs_version": "0.1.0",
          "type": "wine"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:sancerre-generic\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\", \"fr\"],\n  \"name\": {\n    \"en\": \"Sancerre\",\n    \"fr\": \"Sancerre\"\n  },\n  \"description\": {\n    \"en\": \"A classic white wine from the Sancerre appellation in the Loire Valley, France, made exclusively from Sauvignon Blanc. Known for its crisp acidity, mineral character, and vibrant citrus and herbaceous aromas.\"\n  },\n  \"type\": \"wine\",\n  \"subtype\": \"white\",\n  \"grape_varieties\": [\"Sauvignon Blanc\"],\n  \"region\": {\n    \"en\": \"Loire Valley, France\",\n    \"fr\": \"Vallée de la Loire, France\"\n  },\n  \"appellation\": {\n    \"en\": \"Sancerre AOC\",\n    \"fr\": \"Sancerre AOC\"\n  },\n  \"abv\": 13.0,\n  \"residual_sugar\": 2.5,\n  \"serve_temperature_c\": {\n    \"min\": 8,\n    \"max\": 11\n  },\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 1.5,\n      \"sour\": 6.5,\n      \"salty\": 1.0,\n      \"bitter\": 1.5,\n      \"umami\": 0.5\n    },\n    \"structural\": {\n      \"body\": 4.0,\n      \"acidity_perception\": 7.5,\n      \"tannin\": 0.5,\n      \"astringency\": 1.0,\n      \"alcohol_heat\": 3.5,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"grapefruit\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 7.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"lemon\",\n        \"qualifier\": \"zest\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"stone fruit\",\n        \"item\": \"white peach\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"green herbs\",\n        \"item\": \"cut grass\",\n        
\"qualifier\": \"fresh\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"green herbs\",\n        \"item\": \"nettle\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 5.0\n      },\n      {\n        \"family\": \"vegetal\",\n        \"group\": \"green vegetables\",\n        \"item\": \"green pepper\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 3.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"flint\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"chalk\",\n        \"intensity\": 5.0\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"white flowers\",\n        \"item\": \"elderflower\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 4.0\n      }\n    ],\n    \"texture\": [\"crisp\", \"clean\", \"light-bodied\", \"refreshing\"],\n    \"finish_length\": 6.5,\n    \"complexity\": 6.0\n  },\n  \"notes\": {\n    \"en\": \"Sancerre is a benchmark expression of Sauvignon Blanc, showcasing the terroir of the Loire Valley through its signature flinty minerality, bright citrus, and herbaceous notes. Best enjoyed young and fresh, typically within 3-5 years of vintage.\"",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "['en', 'fr'] is not of type 'object'"
            ],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": null,
                "expected": "ogs:core:beverage:sancerre-generic",
                "match": false
              },
              "ogs_version": {
                "predicted": null,
                "expected": "0.1.0",
                "match": false
              },
              "type": {
                "predicted": null,
                "expected": "wine",
                "match": false
              }
            }
          },
          "metrics": {
            "input_tokens": 3089,
            "output_tokens": 1024,
            "latency_ms": 11083.4,
            "usd": 0.024627,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "max_tokens"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:sancerre-generic\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\", \"fr\"],\n  \"name\": {\n    \"en\": \"Sancerre\",\n    \"fr\": \"Sancerre\"\n  },\n  \"description\": {\n    \"en\": \"A classic Loire Valley white wine made from Sauvignon Blanc, known for its vibrant acidity, mineral character, and crisp herbal and citrus aromatics.\",\n    \"fr\": \"Un vin blanc classique de la Loire, issu du Sauvignon Blanc, réputé pour sa vivacité, son caractère minéral et ses arômes herbacés et citronnés.\"\n  },\n  \"type\": \"wine\",\n  \"subtype\": \"white\",\n  \"grape_varieties\": [\"Sauvignon Blanc\"],\n  \"region\": {\n    \"en\": \"Loire Valley, France\",\n    \"fr\": \"Val de Loire, France\"\n  },\n  \"appellation\": {\n    \"en\": \"Sancerre AOC\",\n    \"fr\": \"Sancerre AOC\"\n  },\n  \"abv\": 13.0,\n  \"residual_sugar\": 2.5,\n  \"serve_temperature_c\": {\n    \"min\": 8,\n    \"max\": 11\n  },\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 1.5,\n      \"sour\": 7.0,\n      \"salty\": 1.0,\n      \"bitter\": 1.5,\n      \"umami\": 0.5\n    },\n    \"structural\": {\n      \"body\": 4.5,\n      \"acidity_perception\": 7.5,\n      \"tannin\": 0.5,\n      \"astringency\": 1.0,\n      \"alcohol_heat\": 3.5,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"grapefruit\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 7.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"lime\",\n        \"qualifier\": \"zest\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"stone fruit\",\n        \"item\": \"white peach\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": 
\"herbal\",\n        \"group\": \"green herb\",\n        \"item\": \"cut grass\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"green herb\",\n        \"item\": \"nettle\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 5.0\n      },\n      {\n        \"family\": \"vegetal\",\n        \"group\": \"green\",\n        \"item\": \"green bell pepper\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 3.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"flint\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"chalk\",\n        \"intensity\": 5.0\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"white flower\",\n        \"item\": \"elderflower\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 4.0\n      }\n    ],\n    \"texture\": [\"crisp\", \"lean\", \"refreshing\", \"clean\"],\n    \"finish_length\": 6.5,\n    \"complexity\": 6.5\n  },\n  \"notes\": {\n    \"en\": \"Best enjoyed young (within 3-5 years of vintage), though top crus can age gracefully. Excellent with goat",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "['en', 'fr'] is not of type 'object'"
            ],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": null,
                "expected": "ogs:core:beverage:sancerre-generic",
                "match": false
              },
              "ogs_version": {
                "predicted": null,
                "expected": "0.1.0",
                "match": false
              },
              "type": {
                "predicted": null,
                "expected": "wine",
                "match": false
              }
            }
          },
          "metrics": {
            "input_tokens": 3089,
            "output_tokens": 1024,
            "latency_ms": 11135.8,
            "usd": 0.024627,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "max_tokens"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:sancerre-generic\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\", \"fr\"],\n  \"name\": {\n    \"en\": \"Sancerre\",\n    \"fr\": \"Sancerre\"\n  },\n  \"description\": {\n    \"en\": \"A classic white wine from the Sancerre appellation in the Loire Valley, France, made exclusively from Sauvignon Blanc. Known for its vivid acidity, mineral character, and aromatic expressiveness.\"\n  },\n  \"type\": \"wine\",\n  \"subtype\": \"white\",\n  \"grape_varieties\": [\"Sauvignon Blanc\"],\n  \"region\": {\n    \"en\": \"Loire Valley\",\n    \"fr\": \"Vallée de la Loire\"\n  },\n  \"appellation\": {\n    \"en\": \"Sancerre AOC\",\n    \"fr\": \"Sancerre AOC\"\n  },\n  \"producer\": {\n    \"en\": \"Various\"\n  },\n  \"abv\": 13.0,\n  \"residual_sugar\": 2.5,\n  \"serve_temperature_c\": {\n    \"min\": 8,\n    \"max\": 11\n  },\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 1.5,\n      \"sour\": 7.5,\n      \"salty\": 1.0,\n      \"bitter\": 1.5,\n      \"umami\": 0.5\n    },\n    \"structural\": {\n      \"body\": 4.5,\n      \"acidity_perception\": 8.0,\n      \"tannin\": 0.5,\n      \"astringency\": 1.0,\n      \"alcohol_heat\": 4.0,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"grapefruit\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 7.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"lemon\",\n        \"qualifier\": \"zest\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"stone fruit\",\n        \"item\": \"white peach\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 5.0\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"green herbs\",\n        \"item\": \"cut 
grass\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"green herbs\",\n        \"item\": \"nettle\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"white flowers\",\n        \"item\": \"elderflower\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 5.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"flint\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"vegetal\",\n        \"group\": \"green vegetal\",\n        \"item\": \"green bell pepper\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 3.5\n      }\n    ],\n    \"texture\": [\"crisp\", \"lean\", \"refreshing\", \"saline\"],\n    \"finish_length\": 7.0,\n    \"complexity\": 6.5\n  },\n  \"notes\": {\n    \"en\": \"Sancerre is celebrated for its piercing acidity, stony minerality, and vibrant citrus and herbaceous aromatics. Flint and chalk from the Kimmeridgian soils impart a distinctive mineral salinity. Best enjoyed young and fresh, though top examples can age gracefully for several years.\"\n  },\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "['en', 'fr'] is not of type 'object'"
            ],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": null,
                "expected": "ogs:core:beverage:sancerre-generic",
                "match": false
              },
              "ogs_version": {
                "predicted": null,
                "expected": "0.1.0",
                "match": false
              },
              "type": {
                "predicted": null,
                "expected": "wine",
                "match": false
              }
            }
          },
          "metrics": {
            "input_tokens": 3089,
            "output_tokens": 1024,
            "latency_ms": 11509.7,
            "usd": 0.024627,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "max_tokens"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 3089.0,
        "output_tokens_mean": 1024.0,
        "latency_ms_mean": 11242.967,
        "usd_sum": 0.073881
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-003",
      "category": "structured_output",
      "capability": "schema_valid_pairing",
      "difficulty": "expert",
      "cuisine_region": "french",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.pairing",
        "vocab.match_types",
        "vocab.explanation_codes",
        "vocab.grade_scale",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "2740dc70d150e12a",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS pairing schema (`schema/pairing.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/pairing.schema.json\",\n  \"title\": \"OGS Pairing\",\n  \"description\": \"A scored, explained pairing between a dish and a beverage.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"dish_ref\",\n    \"beverage_ref\",\n    \"match_types\",\n    \"overall_score\",\n    \"grade\",\n    \"explanation_codes\",\n    \"confidence\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:pairing:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n     
 \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"dish_ref\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:dish:[a-z0-9-]+$\"\n    },\n    \"beverage_ref\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:beverage:[a-z0-9-]+$\"\n    },\n    \"match_types\": {\n      \"type\": \"array\",\n      \"minItems\": 1,\n      \"items\": {\n        \"$ref\": \"#/$defs/match_type_entry\"\n      }\n    },\n    \"overall_score\": {\n      \"type\": \"integer\",\n      \"minimum\": 0,\n      \"maximum\": 100\n    },\n    \"grade\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"A+\",\n        \"A\",\n        \"B+\",\n        \"B\",\n        \"C+\",\n        \"C\",\n        \"D\",\n        \"F\"\n      ]\n    },\n    \"explanation_codes\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/explanation_code_entry\"\n      }\n    },\n    \"risks\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/risk_entry\"\n      }\n    },\n    \"confidence\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 1\n    },\n    \"evidence\": {\n      \"$ref\": \"#/$defs/evidence\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          
\"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"match_type_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"type\",\n 
       \"score\"\n      ],\n      \"properties\": {\n        \"type\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"harmony\",\n            \"contrast\",\n            \"bridge\",\n            \"intensity_alignment\",\n            \"structural_balance\",\n            \"refresh\"\n          ]\n        },\n        \"score\": {\n          \"type\": \"integer\",\n          \"minimum\": 0,\n          \"maximum\": 100\n        },\n        \"description\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"primary\": {\n          \"type\": \"boolean\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"explanation_code_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"code\"\n      ],\n      \"properties\": {\n        \"code\": {\n          \"type\": \"string\"\n        },\n        \"weight\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 1\n        },\n        \"detail\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"risk_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"code\",\n        \"severity\"\n      ],\n      \"properties\": {\n        \"code\": {\n          \"type\": \"string\"\n        },\n        \"severity\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"low\",\n            \"medium\",\n            \"high\"\n          ]\n        },\n        \"description\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"mitigation\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"evidence\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"method\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"expert_panel\",\n            \"instrumental\",\n            \"computed\",\n            \"estimated\"\n  
        ]\n        },\n        \"sources\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n### OGS Explanation Codes\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a 
crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor 
intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n### Grade scale (derived from 0-100 overall_score)\n\n- **A+**: 95-100\n- **A**: 90-94\n- **B+**: 85-89\n- **B**: 80-84\n- **C+**: 75-79\n- **C**: 70-74\n- **D**: 60-69\n- **F**: <60\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. 
`core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce an OGS pairing JSON document for the described match. It MUST validate against the OGS pairing schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"pairing_slug\": \"brie-champagne\",\n  \"dish_ref\": \"ogs:core:dish:brie-on-baguette\",\n  \"beverage_ref\": \"ogs:core:beverage:brut-champagne-nv\",\n  \"intent\": \"A classic refresh pairing driven by effervescence and acidity against creamy brie.\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/pairing.schema.json",
        "required_fields": {
          "id": "ogs:core:pairing:brie-champagne",
          "ogs_version": "0.1.0",
          "dish_ref": "ogs:core:dish:brie-on-baguette",
          "beverage_ref": "ogs:core:beverage:brut-champagne-nv"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:pairing:brie-champagne\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Brie on Baguette with Brut Champagne NV\",\n  \"dish_ref\": \"ogs:core:dish:brie-on-baguette\",\n  \"beverage_ref\": \"ogs:core:beverage:brut-champagne-nv\",\n  \"match_types\": [\n    {\n      \"type\": \"refresh\",\n      \"score\": 90,\n      \"primary\": true,\n      \"description\": \"Effervescence and acidity of Brut Champagne cut through the creamy richness of brie, cleansing the palate between bites.\"\n    },\n    {\n      \"type\": \"contrast\",\n      \"score\": 82,\n      \"primary\": false,\n      \"description\": \"The lean, crisp character of the Champagne contrasts with the rich, fatty creaminess of the brie.\"\n    },\n    {\n      \"type\": \"structural_balance\",\n      \"score\": 85,\n      \"primary\": false,\n      \"description\": \"The body and intensity of the pairing are well-aligned, with neither element overwhelming the other.\"\n    }\n  ],\n  \"overall_score\": 88,\n  \"grade\": \"B+\",\n  \"explanation_codes\": [\n    {\n      \"code\": \"REFRESH_EFFERVESCE\",\n      \"weight\": 0.45,\n      \"detail\": \"Carbonation in the Brut Champagne physically scrubs the palate of brie's fat between bites.\"\n    },\n    {\n      \"code\": \"REFRESH_ACID\",\n      \"weight\": 0.35,\n      \"detail\": \"High acidity of the Champagne cuts through the creamy richness of the brie, maintaining freshness throughout the pairing.\"\n    },\n    {\n      \"code\": \"CONTRAST_RICH_CRISP\",\n      \"weight\": 0.10,\n      \"detail\": \"The rich, heavy brie is contrasted by the crisp, lean Champagne, creating a pleasant dynamic tension.\"\n    },\n    {\n      \"code\": \"STRUCT_BODY_ALIGN\",\n      \"weight\": 0.10,\n      \"detail\": \"The medium body of Brut Champagne aligns well with the soft, creamy weight of brie on baguette.\"\n    }\n  ],\n  \"confidence\": 0.92,\n  \"evidence\": {\n    \"method\": \"expert_panel\",\n    
\"notes\": \"A classic and well-established pairing recognized across French gastronomy and sommelier literature.\"\n  },\n  \"description\": \"A classic refresh pairing driven by effervescence and acidity against creamy brie. The Brut Champagne NV delivers persistent bubbles and vibrant acidity that cleanse the palate of brie's lush, buttery fat, making each bite feel as fresh as the first.\",\n  \"notes\": \"Serve Champagne well chilled at 8-10°C. Allow brie to reach room temperature for optimal creaminess and flavor expression.\",\n  \"metadata\": {\n    \"source\": \"OGS Core Pairing Library\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:pairing:brie-champagne",
                "expected": "ogs:core:pairing:brie-champagne",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "dish_ref": {
                "predicted": "ogs:core:dish:brie-on-baguette",
                "expected": "ogs:core:dish:brie-on-baguette",
                "match": true
              },
              "beverage_ref": {
                "predicted": "ogs:core:beverage:brut-champagne-nv",
                "expected": "ogs:core:beverage:brut-champagne-nv",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 4200,
            "output_tokens": 797,
            "latency_ms": 12347.8,
            "usd": 0.024555,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:pairing:brie-champagne\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Brie on Baguette with Brut Champagne NV\",\n  \"dish_ref\": \"ogs:core:dish:brie-on-baguette\",\n  \"beverage_ref\": \"ogs:core:beverage:brut-champagne-nv\",\n  \"match_types\": [\n    {\n      \"type\": \"refresh\",\n      \"score\": 92,\n      \"primary\": true,\n      \"description\": \"Effervescence and high acidity of Brut Champagne cut through the rich, creamy texture of brie, cleansing the palate with each sip.\"\n    },\n    {\n      \"type\": \"contrast\",\n      \"score\": 85,\n      \"primary\": false,\n      \"description\": \"The lean, crisp character of the Champagne contrasts beautifully with the rich, fatty creaminess of the brie.\"\n    },\n    {\n      \"type\": \"structural_balance\",\n      \"score\": 88,\n      \"primary\": false,\n      \"description\": \"The body and intensity of the Champagne aligns well with the mild but rich brie, neither overwhelming the other.\"\n    }\n  ],\n  \"overall_score\": 91,\n  \"grade\": \"A\",\n  \"explanation_codes\": [\n    {\n      \"code\": \"REFRESH_EFFERVESCE\",\n      \"weight\": 0.40,\n      \"detail\": \"The persistent fine bubbles of Brut Champagne physically scrub the palate of rich, fatty brie residue between bites.\"\n    },\n    {\n      \"code\": \"REFRESH_ACID\",\n      \"weight\": 0.30,\n      \"detail\": \"High acidity in the Champagne refreshes the palate and cuts through the fat content of the creamy brie.\"\n    },\n    {\n      \"code\": \"CONTRAST_RICH_CRISP\",\n      \"weight\": 0.15,\n      \"detail\": \"The rich, buttery brie is beautifully contrasted by the crisp, lean profile of the Brut Champagne.\"\n    },\n    {\n      \"code\": \"STRUCT_ACIDITY_BALANCE\",\n      \"weight\": 0.10,\n      \"detail\": \"The beverage's acidity balances the richness of the brie, preventing the pairing from feeling heavy.\"\n    },\n    {\n      \"code\": \"HARMONY_REGIONAL\",\n   
   \"weight\": 0.05,\n      \"detail\": \"Both brie and Champagne share French origins, reflecting a classic regional affinity.\"\n    }\n  ],\n  \"risks\": [\n    {\n      \"code\": \"RISK_INTENSITY_MISMATCH\",\n      \"severity\": \"low\",\n      \"description\": \"A very aged or strongly flavored brie could potentially overpower the more delicate Champagne aromas.\",\n      \"mitigation\": \"Choose a young, mild brie to ensure the Champagne's subtle fruit and toasty notes are not overwhelmed.\"\n    }\n  ],\n  \"confidence\": 0.92,\n  \"evidence\": {\n    \"method\": \"expert_panel\",\n    \"notes\": \"Classic French pairing with extensive culinary tradition supporting the match.\"\n  },\n  \"description\": \"A quintessentially French pairing in which the effervescence and bright acidity of a Brut Champagne NV provide a refreshing counterpoint to the lush, creamy richness of brie served on a crisp baguette. The bubbles cleanse the palate, while the wine's citrus and toasty notes complement the mild, buttery character of the cheese.\",\n  \"metadata\": {\n    \"source\": \"OGS Core Pairing Library\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:pairing:brie-champagne",
                "expected": "ogs:core:pairing:brie-champagne",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "dish_ref": {
                "predicted": "ogs:core:dish:brie-on-baguette",
                "expected": "ogs:core:dish:brie-on-baguette",
                "match": true
              },
              "beverage_ref": {
                "predicted": "ogs:core:beverage:brut-champagne-nv",
                "expected": "ogs:core:beverage:brut-champagne-nv",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 4200,
            "output_tokens": 941,
            "latency_ms": 17547.2,
            "usd": 0.026715,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:pairing:brie-champagne\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Brie on Baguette with Brut Champagne NV\",\n  \"dish_ref\": \"ogs:core:dish:brie-on-baguette\",\n  \"beverage_ref\": \"ogs:core:beverage:brut-champagne-nv\",\n  \"match_types\": [\n    {\n      \"type\": \"refresh\",\n      \"score\": 90,\n      \"primary\": true,\n      \"description\": \"Effervescence and high acidity of Brut Champagne cleanse the palate between bites of creamy brie.\"\n    },\n    {\n      \"type\": \"contrast\",\n      \"score\": 82,\n      \"primary\": false,\n      \"description\": \"The crisp, lean character of the Champagne contrasts beautifully with the rich, fatty creaminess of brie.\"\n    },\n    {\n      \"type\": \"structural_balance\",\n      \"score\": 85,\n      \"primary\": false,\n      \"description\": \"The acidity of the Champagne cuts through the fat of the brie, maintaining a fresh and balanced mouthfeel.\"\n    }\n  ],\n  \"overall_score\": 88,\n  \"grade\": \"B+\",\n  \"explanation_codes\": [\n    {\n      \"code\": \"REFRESH_EFFERVESCE\",\n      \"weight\": 0.40,\n      \"detail\": \"Champagne bubbles physically scrub the rich, creamy residue of brie from the palate.\"\n    },\n    {\n      \"code\": \"REFRESH_ACID\",\n      \"weight\": 0.30,\n      \"detail\": \"Brut Champagne's high acidity cuts through the fat of the brie, refreshing the palate with each sip.\"\n    },\n    {\n      \"code\": \"CONTRAST_RICH_CRISP\",\n      \"weight\": 0.20,\n      \"detail\": \"The lean, crisp profile of the Champagne provides a lively contrast to the dense, creamy texture of brie.\"\n    },\n    {\n      \"code\": \"STRUCT_ACIDITY_BALANCE\",\n      \"weight\": 0.10,\n      \"detail\": \"Acidity in the Champagne balances the richness of the brie and the neutral starch of the baguette.\"\n    }\n  ],\n  \"confidence\": 0.91,\n  \"description\": \"A classic French pairing where Brut Champagne's effervescence and 
sharp acidity act as a palate cleanser against the lush, creamy softness of brie served on a baguette slice. Each sip resets the palate for the next bite.\",\n  \"evidence\": {\n    \"method\": \"expert_panel\",\n    \"notes\": \"Widely regarded as a canonical pairing in French gastronomy and sommelier education.\"\n  },\n  \"metadata\": {\n    \"created\": \"2025-01-01T00:00:00Z\",\n    \"source\": \"OGS Core Pairings Library\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:pairing:brie-champagne",
                "expected": "ogs:core:pairing:brie-champagne",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "dish_ref": {
                "predicted": "ogs:core:dish:brie-on-baguette",
                "expected": "ogs:core:dish:brie-on-baguette",
                "match": true
              },
              "beverage_ref": {
                "predicted": "ogs:core:beverage:brut-champagne-nv",
                "expected": "ogs:core:beverage:brut-champagne-nv",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 4200,
            "output_tokens": 779,
            "latency_ms": 11204.3,
            "usd": 0.024285,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 4200.0,
        "output_tokens_mean": 839.0,
        "latency_ms_mean": 13699.767,
        "usd_sum": 0.075555
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-004",
      "category": "structured_output",
      "capability": "sensory_profile_json",
      "difficulty": "medium",
      "cuisine_region": "indian",
      "scorer": "json_match",
      "context_needs": [
        "scales.basic_tastes",
        "scales.structural",
        "vocab.aromatic_families",
        "vocab.qualifiers"
      ],
      "prompt_template_hash": "9bd6860a8c5c69ec",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Aromatic qualifiers\n\nfresh, dried, cooked, candied, fermented, zest, overripe\n\n---\n\nProduce a JSON object representing the sensory profile for ripe mango. The object must have keys basic_tastes (with sweet, sour, salty, bitter, umami), structural (with body, acidity_perception, tannin, astringency, alcohol_heat, effervescence), and aromatics (array of entries with family, group, item, intensity). All intensities on 0-10. 
Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"item\": \"ripe Alphonso mango, peak ripeness\"\n}\n\nANSWER FORMAT: json\n\nRESPONSE JSON SCHEMA:\n{\n  \"type\": \"object\",\n  \"required\": [\n    \"basic_tastes\",\n    \"structural\",\n    \"aromatics\"\n  ],\n  \"properties\": {\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"sweet\",\n        \"sour\",\n        \"salty\",\n        \"bitter\",\n        \"umami\"\n      ]\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"body\",\n        \"acidity_perception\",\n        \"tannin\",\n        \"astringency\",\n        \"alcohol_heat\",\n        \"effervescence\"\n      ]\n    },\n    \"aromatics\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"type\": \"object\",\n        \"required\": [\n          \"family\",\n          \"group\",\n          \"item\",\n          \"intensity\"\n        ]\n      }\n    }\n  }\n}",
      "expected": {
        "value": {
          "basic_tastes": {
            "sweet": 8.5,
            "sour": 3.0,
            "salty": 0,
            "bitter": 0.5,
            "umami": 0.5
          },
          "structural": {
            "body": 5.5,
            "acidity_perception": 3.0,
            "tannin": 0,
            "astringency": 0.5,
            "alcohol_heat": 0,
            "effervescence": 0
          }
        },
        "required_keys": [
          "basic_tastes.sweet",
          "basic_tastes.sour",
          "structural.body",
          "structural.acidity_perception",
          "structural.tannin"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"basic_tastes\": {\n    \"sweet\": 8.5,\n    \"sour\": 3.0,\n    \"salty\": 0.5,\n    \"bitter\": 0.5,\n    \"umami\": 1.0\n  },\n  \"structural\": {\n    \"body\": 7.0,\n    \"acidity_perception\": 3.0,\n    \"tannin\": 0.5,\n    \"astringency\": 0.5,\n    \"alcohol_heat\": 0.0,\n    \"effervescence\": 0.0\n  },\n  \"aromatics\": [\n    {\n      \"family\": \"fruit\",\n      \"group\": \"tropical\",\n      \"item\": \"mango\",\n      \"intensity\": 9.5\n    },\n    {\n      \"family\": \"fruit\",\n      \"group\": \"tropical\",\n      \"item\": \"peach\",\n      \"intensity\": 6.0\n    },\n    {\n      \"family\": \"fruit\",\n      \"group\": \"tropical\",\n      \"item\": \"apricot\",\n      \"intensity\": 5.5\n    },\n    {\n      \"family\": \"fruit\",\n      \"qualifier\": \"overripe\",\n      \"group\": \"tropical\",\n      \"item\": \"banana\",\n      \"intensity\": 4.0\n    },\n    {\n      \"family\": \"confection\",\n      \"group\": \"sweet\",\n      \"item\": \"honey\",\n      \"intensity\": 5.0\n    },\n    {\n      \"family\": \"floral\",\n      \"group\": \"fresh\",\n      \"item\": \"orange blossom\",\n      \"intensity\": 4.5\n    },\n    {\n      \"family\": \"spice\",\n      \"group\": \"warm\",\n      \"item\": \"saffron\",\n      \"intensity\": 3.5\n    },\n    {\n      \"family\": \"herbal\",\n      \"group\": \"fresh\",\n      \"item\": \"fresh herb\",\n      \"intensity\": 2.0\n    },\n    {\n      \"family\": \"fruit\",\n      \"qualifier\": \"zest\",\n      \"group\": \"citrus\",\n      \"item\": \"lime zest\",\n      \"intensity\": 3.0\n    }\n  ]\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_key": {
              "basic_tastes.sweet": {
                "predicted": 8.5,
                "expected": 8.5,
                "match": true
              },
              "basic_tastes.sour": {
                "predicted": 3.0,
                "expected": 3.0,
                "match": true
              },
              "structural.body": {
                "predicted": 7.0,
                "expected": 5.5,
                "match": true
              },
              "structural.acidity_perception": {
                "predicted": 3.0,
                "expected": 3.0,
                "match": true
              },
              "structural.tannin": {
                "predicted": 0.5,
                "expected": 0,
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 830,
            "output_tokens": 546,
            "latency_ms": 6598.9,
            "usd": 0.01068,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"basic_tastes\": {\n    \"sweet\": 9.0,\n    \"sour\": 2.5,\n    \"salty\": 0.2,\n    \"bitter\": 0.5,\n    \"umami\": 0.5\n  },\n  \"structural\": {\n    \"body\": 7.5,\n    \"acidity_perception\": 2.5,\n    \"tannin\": 0.5,\n    \"astringency\": 0.5,\n    \"alcohol_heat\": 0.0,\n    \"effervescence\": 0.0\n  },\n  \"aromatics\": [\n    {\n      \"family\": \"fruit\",\n      \"group\": \"tropical\",\n      \"item\": \"mango\",\n      \"intensity\": 9.5\n    },\n    {\n      \"family\": \"fruit\",\n      \"group\": \"tropical\",\n      \"item\": \"peach\",\n      \"intensity\": 6.0\n    },\n    {\n      \"family\": \"fruit\",\n      \"group\": \"tropical\",\n      \"item\": \"apricot\",\n      \"intensity\": 5.5\n    },\n    {\n      \"family\": \"fruit\",\n      \"group\": \"overripe\",\n      \"item\": \"overripe banana\",\n      \"intensity\": 4.0\n    },\n    {\n      \"family\": \"floral\",\n      \"group\": \"fresh\",\n      \"item\": \"jasmine\",\n      \"intensity\": 4.5\n    },\n    {\n      \"family\": \"confection\",\n      \"group\": \"candied\",\n      \"item\": \"honey\",\n      \"intensity\": 5.5\n    },\n    {\n      \"family\": \"spice\",\n      \"group\": \"fresh\",\n      \"item\": \"saffron\",\n      \"intensity\": 3.0\n    },\n    {\n      \"family\": \"vegetal\",\n      \"group\": \"fresh\",\n      \"item\": \"green leaf\",\n      \"intensity\": 1.5\n    },\n    {\n      \"family\": \"fruit\",\n      \"group\": \"zest\",\n      \"item\": \"citrus zest\",\n      \"intensity\": 3.0\n    }\n  ]\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_key": {
              "basic_tastes.sweet": {
                "predicted": 9.0,
                "expected": 8.5,
                "match": true
              },
              "basic_tastes.sour": {
                "predicted": 2.5,
                "expected": 3.0,
                "match": true
              },
              "structural.body": {
                "predicted": 7.5,
                "expected": 5.5,
                "match": true
              },
              "structural.acidity_perception": {
                "predicted": 2.5,
                "expected": 3.0,
                "match": true
              },
              "structural.tannin": {
                "predicted": 0.5,
                "expected": 0,
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 830,
            "output_tokens": 532,
            "latency_ms": 6392.4,
            "usd": 0.01047,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"basic_tastes\": {\n    \"sweet\": 9.0,\n    \"sour\": 2.5,\n    \"salty\": 0.5,\n    \"bitter\": 0.5,\n    \"umami\": 1.0\n  },\n  \"structural\": {\n    \"body\": 7.5,\n    \"acidity_perception\": 2.5,\n    \"tannin\": 0.5,\n    \"astringency\": 0.5,\n    \"alcohol_heat\": 0.0,\n    \"effervescence\": 0.0\n  },\n  \"aromatics\": [\n    {\n      \"family\": \"fruit\",\n      \"group\": \"tropical\",\n      \"item\": \"mango\",\n      \"intensity\": 9.5\n    },\n    {\n      \"family\": \"fruit\",\n      \"group\": \"stone fruit\",\n      \"item\": \"peach\",\n      \"intensity\": 6.0\n    },\n    {\n      \"family\": \"fruit\",\n      \"group\": \"citrus\",\n      \"item\": \"apricot\",\n      \"intensity\": 5.0\n    },\n    {\n      \"family\": \"confection\",\n      \"group\": \"sweet\",\n      \"item\": \"honey\",\n      \"intensity\": 5.5\n    },\n    {\n      \"family\": \"floral\",\n      \"group\": \"light floral\",\n      \"item\": \"jasmine\",\n      \"intensity\": 4.0\n    },\n    {\n      \"family\": \"spice\",\n      \"group\": \"warm spice\",\n      \"item\": \"saffron\",\n      \"intensity\": 3.0\n    },\n    {\n      \"family\": \"vegetal\",\n      \"group\": \"fresh green\",\n      \"item\": \"fresh leaves\",\n      \"intensity\": 1.5\n    },\n    {\n      \"family\": \"fruit\",\n      \"group\": \"overripe\",\n      \"item\": \"overripe tropical\",\n      \"intensity\": 3.5\n    }\n  ]\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_key": {
              "basic_tastes.sweet": {
                "predicted": 9.0,
                "expected": 8.5,
                "match": true
              },
              "basic_tastes.sour": {
                "predicted": 2.5,
                "expected": 3.0,
                "match": true
              },
              "structural.body": {
                "predicted": 7.5,
                "expected": 5.5,
                "match": true
              },
              "structural.acidity_perception": {
                "predicted": 2.5,
                "expected": 3.0,
                "match": true
              },
              "structural.tannin": {
                "predicted": 0.5,
                "expected": 0,
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 830,
            "output_tokens": 495,
            "latency_ms": 6468.5,
            "usd": 0.009915,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 830.0,
        "output_tokens_mean": 524.333,
        "latency_ms_mean": 6486.6,
        "usd_sum": 0.031065
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-005",
      "category": "structured_output",
      "capability": "schema_valid_dish",
      "difficulty": "expert",
      "cuisine_region": "french",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.dish",
        "vocab.component_roles",
        "vocab.cooking_methods",
        "vocab.courses",
        "vocab.service_temperatures",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "9caf9bc474b2464d",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS dish schema (`schema/dish.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/dish.schema.json\",\n  \"title\": \"OGS Dish\",\n  \"description\": \"A composed dish with components, cooking methods, and overall sensory profile.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"components\",\n    \"overall_sensory_profile\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:dish:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": 
\"#/$defs/lstring\"\n    },\n    \"cuisine\": {\n      \"type\": \"string\",\n      \"description\": \"Free-text cuisine name (v0.1) OR a cuisine entity reference matching ogs:<ns>:cuisine:<id> (v0.2+).\"\n    },\n    \"course\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"amuse\",\n        \"appetizer\",\n        \"soup\",\n        \"salad\",\n        \"fish\",\n        \"main\",\n        \"cheese\",\n        \"dessert\",\n        \"petit_four\"\n      ]\n    },\n    \"service_temperature\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"frozen\",\n        \"cold\",\n        \"cool\",\n        \"room\",\n        \"warm\",\n        \"hot\",\n        \"very_hot\"\n      ]\n    },\n    \"components\": {\n      \"type\": \"array\",\n      \"minItems\": 1,\n      \"items\": {\n        \"$ref\": \"#/$defs/component\"\n      }\n    },\n    \"overall_sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              
\"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"component\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"role\",\n        \"ingredient_ref\"\n      ],\n      \"properties\": {\n        \"role\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"main_protein\",\n            \"main_vegetable\",\n            \"starch\",\n            \"sauce\",\n            \"fat_component\",\n            \"acid_component\",\n            
\"aromatic_garnish\",\n            \"seasoning\",\n            \"textural_element\"\n          ]\n        },\n        \"ingredient_ref\": {\n          \"type\": \"string\",\n          \"pattern\": \"^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$\"\n        },\n        \"cooking_method\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"raw\",\n            \"seared\",\n            \"grilled\",\n            \"roasted\",\n            \"braised\",\n            \"poached\",\n            \"steamed\",\n            \"fried\",\n            \"confit\",\n            \"smoked\",\n            \"fermented\",\n            \"cured\",\n            \"pickled\",\n            \"dehydrated\",\n            \"sous_vide\",\n            \"blanched\",\n            \"sauteed\",\n            \"baked\",\n            \"charred\",\n            \"emulsified\"\n          ]\n        },\n        \"maillard_index\": {\n          \"type\": \"integer\",\n          \"minimum\": 0,\n          \"maximum\": 5\n        },\n        \"reduction_level\": {\n          \"type\": \"integer\",\n          \"minimum\": 0,\n          \"maximum\": 5\n        },\n        \"proportion\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"dominant\",\n            \"significant\",\n            \"accent\",\n            \"trace\"\n          ]\n        },\n        \"sensory_contribution\": {\n          \"type\": \"object\",\n          \"properties\": {\n            \"basic_tastes\": {\n              \"$ref\": \"#/$defs/basic_tastes\"\n            },\n            \"aromatics\": {\n              \"type\": \"array\",\n              \"items\": {\n                \"$ref\": \"#/$defs/aromatic_entry\"\n              }\n            },\n            \"texture\": {\n              \"type\": \"array\",\n              \"items\": {\n                \"type\": \"string\"\n              }\n            }\n          },\n          \"additionalProperties\": true\n        },\n        \"notes\": {\n          
\"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": 
\"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, 
fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Service temperatures\n\nfrozen, cold, cool, room, warm, hot, very_hot\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce an OGS dish JSON document for 'moules marinieres' (mussels steamed with white wine, shallots, parsley, butter). It must validate against the OGS dish schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"dish_slug\": \"moules-marinieres\",\n  \"cuisine\": \"french\",\n  \"course\": \"main\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/dish.schema.json",
        "required_fields": {
          "id": "ogs:core:dish:moules-marinieres",
          "ogs_version": "0.1.0"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:dish:moules-marinieres\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"fr\",\n  \"available_languages\": [\"fr\", \"en\"],\n  \"name\": {\n    \"fr\": \"Moules Marinières\",\n    \"en\": \"Mussels in White Wine Sauce\"\n  },\n  \"description\": {\n    \"fr\": \"Moules cuites à la vapeur avec du vin blanc, des échalotes, du persil et du beurre.\",\n    \"en\": \"Mussels steamed with white wine, shallots, parsley, and butter.\"\n  },\n  \"cuisine\": \"french\",\n  \"course\": \"main\",\n  \"service_temperature\": \"hot\",\n  \"components\": [\n    {\n      \"role\": \"main_protein\",\n      \"ingredient_ref\": \"ogs:core:ingredient:mussels\",\n      \"cooking_method\": \"steamed\",\n      \"maillard_index\": 0,\n      \"proportion\": \"dominant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"salty\": 5.5,\n          \"umami\": 7.0,\n          \"sweet\": 2.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"marine\",\n            \"group\": \"shellfish\",\n            \"item\": \"mussel\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 7.5\n          }\n        ],\n        \"texture\": [\"tender\", \"plump\", \"juicy\"]\n      }\n    },\n    {\n      \"role\": \"acid_component\",\n      \"ingredient_ref\": \"ogs:core:ingredient:dry-white-wine\",\n      \"cooking_method\": \"sauteed\",\n      \"reduction_level\": 2,\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sour\": 4.0,\n          \"sweet\": 2.0\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"fruit\",\n            \"group\": \"citrus\",\n            \"item\": \"lemon\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 3.0\n          },\n          {\n            \"family\": \"fruit\",\n            \"group\": \"tree fruit\",\n            \"item\": 
\"apple\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 3.5\n          }\n        ],\n        \"texture\": [\"liquid\", \"light-bodied\"]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:shallots\",\n      \"cooking_method\": \"sauteed\",\n      \"maillard_index\": 1,\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 3.5,\n          \"umami\": 2.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"vegetal\",\n            \"group\": \"allium\",\n            \"item\": \"shallot\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 4.5\n          }\n        ],\n        \"texture\": [\"soft\", \"translucent\"]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:flat-leaf-parsley\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"bitter\": 1.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"herbal\",\n            \"group\": \"fresh herb\",\n            \"item\": \"parsley\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 5.0\n          }\n        ],\n        \"texture\": [\"soft\"]\n      }\n    },\n    {\n      \"role\": \"fat_component\",\n      \"ingredient_ref\": \"ogs:core:ingredient:unsalted-butter\",\n      \"cooking_method\": \"emulsified\",\n      \"",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "['fr', 'en'] is not of type 'object'"
            ],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": null,
                "expected": "ogs:core:dish:moules-marinieres",
                "match": false
              },
              "ogs_version": {
                "predicted": null,
                "expected": "0.1.0",
                "match": false
              }
            }
          },
          "metrics": {
            "input_tokens": 3752,
            "output_tokens": 1024,
            "latency_ms": 10450.6,
            "usd": 0.026616,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "max_tokens"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:dish:moules-marinieres\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"fr\",\n  \"available_languages\": [\"fr\", \"en\"],\n  \"name\": {\n    \"fr\": \"Moules Marinières\",\n    \"en\": \"Mussels in White Wine\"\n  },\n  \"description\": {\n    \"fr\": \"Moules cuites à la vapeur avec du vin blanc, des échalotes, du persil et du beurre.\",\n    \"en\": \"Mussels steamed with white wine, shallots, parsley, and butter.\"\n  },\n  \"cuisine\": \"french\",\n  \"course\": \"main\",\n  \"service_temperature\": \"hot\",\n  \"components\": [\n    {\n      \"role\": \"main_protein\",\n      \"ingredient_ref\": \"ogs:core:ingredient:mussels\",\n      \"cooking_method\": \"steamed\",\n      \"maillard_index\": 0,\n      \"proportion\": \"dominant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"salty\": 5.5,\n          \"umami\": 7.0,\n          \"sweet\": 2.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"marine\",\n            \"group\": \"shellfish\",\n            \"item\": \"mussel\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 7.5\n          }\n        ],\n        \"texture\": [\"tender\", \"plump\", \"juicy\"]\n      }\n    },\n    {\n      \"role\": \"acid_component\",\n      \"ingredient_ref\": \"ogs:core:ingredient:dry-white-wine\",\n      \"cooking_method\": \"sauteed\",\n      \"reduction_level\": 2,\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sour\": 4.0,\n          \"sweet\": 1.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"fruit\",\n            \"group\": \"tree fruit\",\n            \"item\": \"apple\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 3.5\n          },\n          {\n            \"family\": \"floral\",\n            \"group\": \"white flower\",\n            \"item\": 
\"blossom\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 2.5\n          }\n        ],\n        \"texture\": [\"liquid\", \"aromatic broth\"]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:shallot\",\n      \"cooking_method\": \"sauteed\",\n      \"maillard_index\": 1,\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 3.5,\n          \"umami\": 2.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"vegetal\",\n            \"group\": \"allium\",\n            \"item\": \"shallot\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 4.5\n          }\n        ],\n        \"texture\": [\"soft\", \"translucent\"]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:flat-leaf-parsley\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"bitter\": 1.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"herbal\",\n            \"group\": \"soft herb\",\n            \"item\": \"parsley\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 5.0\n          }\n        ],\n        \"texture\": [\"fresh\", \"leafy\"]\n      }\n    },\n    {\n      \"role\": \"fat_component\",\n      \"ingredient_ref\": \"ogs:core:ingredient:unsalted-butter\",\n      \"cooking_method\": \"emulsified",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "['fr', 'en'] is not of type 'object'"
            ],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": null,
                "expected": "ogs:core:dish:moules-marinieres",
                "match": false
              },
              "ogs_version": {
                "predicted": null,
                "expected": "0.1.0",
                "match": false
              }
            }
          },
          "metrics": {
            "input_tokens": 3752,
            "output_tokens": 1024,
            "latency_ms": 10468.7,
            "usd": 0.026616,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "max_tokens"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:dish:moules-marinieres\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"fr\",\n  \"available_languages\": [\"fr\", \"en\"],\n  \"name\": {\n    \"fr\": \"Moules Marinières\",\n    \"en\": \"Mussels Marinière\"\n  },\n  \"description\": {\n    \"fr\": \"Moules cuites à la vapeur avec du vin blanc, des échalotes, du persil et du beurre.\",\n    \"en\": \"Mussels steamed with white wine, shallots, parsley, and butter.\"\n  },\n  \"cuisine\": \"french\",\n  \"course\": \"main\",\n  \"service_temperature\": \"hot\",\n  \"components\": [\n    {\n      \"role\": \"main_protein\",\n      \"ingredient_ref\": \"ogs:core:ingredient:mussel\",\n      \"cooking_method\": \"steamed\",\n      \"maillard_index\": 0,\n      \"proportion\": \"dominant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"salty\": 5.5,\n          \"umami\": 7.0,\n          \"sweet\": 3.0\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"marine\",\n            \"group\": \"shellfish\",\n            \"item\": \"mussel\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 7.5\n          }\n        ],\n        \"texture\": [\"tender\", \"plump\", \"juicy\"]\n      }\n    },\n    {\n      \"role\": \"sauce\",\n      \"ingredient_ref\": \"ogs:core:ingredient:white-wine\",\n      \"cooking_method\": \"steamed\",\n      \"reduction_level\": 2,\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sour\": 4.0,\n          \"sweet\": 2.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"fruit\",\n            \"group\": \"tree-fruit\",\n            \"item\": \"apple\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 3.0\n          }\n        ],\n        \"texture\": [\"liquid\", \"light-bodied\"]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      
\"ingredient_ref\": \"ogs:core:ingredient:shallot\",\n      \"cooking_method\": \"sauteed\",\n      \"maillard_index\": 1,\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 3.5,\n          \"umami\": 2.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"vegetal\",\n            \"group\": \"allium\",\n            \"item\": \"shallot\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 4.5\n          }\n        ],\n        \"texture\": [\"soft\", \"tender\"]\n      }\n    },\n    {\n      \"role\": \"fat_component\",\n      \"ingredient_ref\": \"ogs:core:ingredient:butter\",\n      \"cooking_method\": \"emulsified\",\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 2.0,\n          \"umami\": 2.0\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"dairy\",\n            \"group\": \"butter\",\n            \"item\": \"cultured-butter\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 4.0\n          }\n        ],\n        \"texture\": [\"rich\", \"silky\", \"emulsified\"]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:flat-leaf-parsley\",\n      \"cooking_method\": \"raw\",\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"bitter\": 1.5\n        },\n        \"aromatics\": [",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "['fr', 'en'] is not of type 'object'"
            ],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": null,
                "expected": "ogs:core:dish:moules-marinieres",
                "match": false
              },
              "ogs_version": {
                "predicted": null,
                "expected": "0.1.0",
                "match": false
              }
            }
          },
          "metrics": {
            "input_tokens": 3752,
            "output_tokens": 1024,
            "latency_ms": 9614.4,
            "usd": 0.026616,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "max_tokens"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 3752.0,
        "output_tokens_mean": 1024.0,
        "latency_ms_mean": 10177.9,
        "usd_sum": 0.079848
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-006",
      "category": "structured_output",
      "capability": "schema_valid_ingredient",
      "difficulty": "medium",
      "cuisine_region": "iranian",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.ingredient",
        "vocab.ingredient_categories",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "bb4d0fcee2d1e751",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS ingredient schema (`schema/ingredient.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/ingredient.schema.json\",\n  \"title\": \"OGS Ingredient\",\n  \"description\": \"An ingredient entity with sensory profile, category, and provenance metadata.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"category\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$\",\n      \"description\": \"OGS identifier in the format ogs:<namespace>:ingredient:<id>\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\",\n      \"description\": \"OGS specification version (0.1.x or 0.2.x)\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\",\n      \"description\": \"Authoritative / endonym language of this entity's human-readable fields.\"\n    
},\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      },\n      \"description\": \"Advisory list of languages covered by this entity's LString fields.\"\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"category\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"protein\",\n        \"vegetable\",\n        \"fruit\",\n        \"grain\",\n        \"dairy\",\n        \"fat\",\n        \"herb\",\n        \"spice\",\n        \"fungus\",\n        \"legume\",\n        \"nut\",\n        \"seed\",\n        \"sweetener\",\n        \"acid\",\n        \"salt\",\n        \"other\"\n      ]\n    },\n    \"subcategory\": {\n      \"type\": \"string\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"origin\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"region\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"season\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"terroir_notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          
\"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        
\"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    
\"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Ingredient categories\n\nprotein, vegetable, fruit, grain, dairy, fat, herb, spice, fungus, legume, nut, seed, sweetener, acid, salt, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic 
tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document that describes 'saffron' as an OGS ingredient. It MUST validate against the OGS ingredient schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"ingredient_name\": \"saffron\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"saffron-stigma\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/ingredient.schema.json",
        "required_fields": {
          "id": "ogs:core:ingredient:saffron-stigma",
          "ogs_version": "0.1.0",
          "category": "spice"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:saffron-stigma\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\"],\n  \"name\": {\n    \"en\": \"Saffron\"\n  },\n  \"description\": {\n    \"en\": \"Saffron is the dried stigmas of Crocus sativus, the most expensive spice in the world by weight. It imparts a distinctive golden-yellow color, a complex floral and honey-like aroma, and a subtly bitter, earthy taste to dishes.\"\n  },\n  \"category\": \"spice\",\n  \"subcategory\": \"stigma\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 2.5,\n      \"sour\": 0.5,\n      \"salty\": 0.0,\n      \"bitter\": 4.5,\n      \"umami\": 1.0\n    },\n    \"structural\": {\n      \"body\": 3.0,\n      \"acidity_perception\": 1.0,\n      \"tannin\": 1.5,\n      \"astringency\": 2.0,\n      \"alcohol_heat\": 0.0,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"floral\",\n        \"group\": \"sweet floral\",\n        \"item\": \"honey-floral\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 8.0\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"exotic spice\",\n        \"item\": \"metallic-medicinal\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"musty\",\n        \"item\": \"hay\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"honey\",\n        \"item\": \"honey\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 5.0\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"aromatic herb\",\n        \"item\": \"grassy\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 3.0\n      }\n    ],\n    \"texture\": [\n      \"powdery\",\n      \"thread-like\",\n      \"slightly sticky when hydrated\"\n    
],\n    \"finish_length\": 7.5,\n    \"complexity\": 8.0\n  },\n  \"origin\": {\n    \"region\": {\n      \"en\": \"Iran (primary producer), also Spain (La Mancha), Kashmir, Greece, and Morocco\"\n    },\n    \"season\": {\n      \"en\": \"Autumn harvest, typically October to November\"\n    },\n    \"terroir_notes\": {\n      \"en\": \"Best saffron comes from arid or semi-arid climates with well-drained soils and cold winters. Iranian and Spanish varieties are most prized for their intense color and aroma compounds (safranal, picrocrocin, crocin).\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Saffron should be steeped in warm water or broth before use to release its color and aroma compounds. Just a small amount (0.1–0.5g) is sufficient to flavor and color an entire dish. Used extensively in Persian, Moroccan, Spanish (paella), and Italian (risotto Milanese) cuisines.\"\n  },\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"updated\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"OGS Core Ingredient Library\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:saffron-stigma",
                "expected": "ogs:core:ingredient:saffron-stigma",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "spice",
                "expected": "spice",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3065,
            "output_tokens": 960,
            "latency_ms": 14680.5,
            "usd": 0.023595,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:saffron-stigma\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\"],\n  \"name\": {\n    \"en\": \"Saffron\"\n  },\n  \"description\": {\n    \"en\": \"Saffron is the dried stigmas of Crocus sativus, prized as the world's most expensive spice. It imparts a vivid golden-yellow color, a distinctive honeyed floral aroma, and a subtly bitter, earthy flavor to dishes.\"\n  },\n  \"category\": \"spice\",\n  \"subcategory\": \"stigma\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 2.5,\n      \"sour\": 0.5,\n      \"salty\": 0.0,\n      \"bitter\": 3.5,\n      \"umami\": 1.0\n    },\n    \"structural\": {\n      \"body\": 4.0,\n      \"acidity_perception\": 0.5,\n      \"tannin\": 1.0,\n      \"astringency\": 1.5,\n      \"alcohol_heat\": 0.0,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"floral\",\n        \"group\": \"honeyed floral\",\n        \"item\": \"honey blossom\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 8.0\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"exotic spice\",\n        \"item\": \"saffron\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 9.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"hay\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"honey\",\n        \"item\": \"beeswax\",\n        \"intensity\": 4.0\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"medicinal herb\",\n        \"item\": \"chamomile\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 3.5\n      }\n    ],\n    \"texture\": [\n      \"powdery\",\n      \"thread-like\",\n      \"slightly waxy when hydrated\"\n    ],\n    \"finish_length\": 7.5,\n    \"complexity\": 
8.0\n  },\n  \"origin\": {\n    \"region\": {\n      \"en\": \"Iran (primary), Spain (La Mancha), Kashmir, Greece\"\n    },\n    \"season\": {\n      \"en\": \"Autumn harvest (October–November)\"\n    },\n    \"terroir_notes\": {\n      \"en\": \"Climate, soil composition, and altitude significantly influence the intensity of safranal and crocin compounds, affecting color strength (measured in ISO units) and aromatic depth.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Saffron should be steeped in warm liquid before use to release its color and aroma compounds. A small amount (0.1–0.3 g) is sufficient to flavor an entire dish. It is central to dishes such as paella, risotto Milanese, bouillabaisse, and Persian rice.\"\n  },\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"updated\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"OGS Core Vocabulary\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:saffron-stigma",
                "expected": "ogs:core:ingredient:saffron-stigma",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "spice",
                "expected": "spice",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3065,
            "output_tokens": 920,
            "latency_ms": 14009.9,
            "usd": 0.022995,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:saffron-stigma\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\"],\n  \"name\": {\n    \"en\": \"Saffron\"\n  },\n  \"description\": {\n    \"en\": \"Saffron is the dried stigmas of Crocus sativus, one of the world's most prized and expensive spices, prized for its vivid golden color, distinctive floral-honey aroma, and slightly bitter, earthy flavor.\"\n  },\n  \"category\": \"spice\",\n  \"subcategory\": \"stigma\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 2.5,\n      \"sour\": 0.5,\n      \"salty\": 0,\n      \"bitter\": 3.5,\n      \"umami\": 1.0\n    },\n    \"structural\": {\n      \"body\": 4.0,\n      \"acidity_perception\": 1.0,\n      \"tannin\": 1.5,\n      \"astringency\": 2.0,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"floral\",\n        \"group\": \"exotic floral\",\n        \"item\": \"honey-floral\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 8.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"hay\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"warm spice\",\n        \"item\": \"metallic-spice\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"honey\",\n        \"item\": \"honey\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 4.0\n      },\n      {\n        \"family\": \"vegetal\",\n        \"group\": \"grassy\",\n        \"item\": \"dried grass\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 3.5\n      }\n    ],\n    \"texture\": [\n      \"delicate\",\n      \"thread-like\",\n      \"slightly waxy\"\n    ],\n    \"finish_length\": 7.5,\n    
\"complexity\": 8.0\n  },\n  \"origin\": {\n    \"region\": {\n      \"en\": \"Iran, Kashmir, Spain, Greece\"\n    },\n    \"season\": {\n      \"en\": \"Autumn (October–November harvest)\"\n    },\n    \"terroir_notes\": {\n      \"en\": \"Grown in semi-arid climates with well-drained soils; Iranian and Kashmiri saffron are considered among the finest grades with highest crocin and safranal content.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Saffron's key aroma compounds include safranal and picrocrocin. It imparts a distinctive golden-yellow color to dishes due to crocin. Used sparingly in paellas, risottos, biryanis, and bouillabaisse.\"\n  },\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"updated\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"OGS Core Ingredient Library\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:saffron-stigma",
                "expected": "ogs:core:ingredient:saffron-stigma",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "spice",
                "expected": "spice",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3065,
            "output_tokens": 899,
            "latency_ms": 13003.4,
            "usd": 0.02268,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 3065.0,
        "output_tokens_mean": 926.333,
        "latency_ms_mean": 13897.933,
        "usd_sum": 0.06927
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-007",
      "category": "structured_output",
      "capability": "schema_valid_ingredient",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.ingredient",
        "vocab.ingredient_categories",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "0bc2e49ee6c1093e",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS ingredient schema (`schema/ingredient.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/ingredient.schema.json\",\n  \"title\": \"OGS Ingredient\",\n  \"description\": \"An ingredient entity with sensory profile, category, and provenance metadata.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"category\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$\",\n      \"description\": \"OGS identifier in the format ogs:<namespace>:ingredient:<id>\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\",\n      \"description\": \"OGS specification version (0.1.x or 0.2.x)\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\",\n      \"description\": \"Authoritative / endonym language of this entity's human-readable fields.\"\n    
},\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      },\n      \"description\": \"Advisory list of languages covered by this entity's LString fields.\"\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"category\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"protein\",\n        \"vegetable\",\n        \"fruit\",\n        \"grain\",\n        \"dairy\",\n        \"fat\",\n        \"herb\",\n        \"spice\",\n        \"fungus\",\n        \"legume\",\n        \"nut\",\n        \"seed\",\n        \"sweetener\",\n        \"acid\",\n        \"salt\",\n        \"other\"\n      ]\n    },\n    \"subcategory\": {\n      \"type\": \"string\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"origin\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"region\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"season\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"terroir_notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          
\"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        
\"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    
\"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Ingredient categories\n\nprotein, vegetable, fruit, grain, dairy, fat, herb, spice, fungus, legume, nut, seed, sweetener, acid, salt, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic 
tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document that describes 'miso (red)' as an OGS ingredient. It MUST validate against the OGS ingredient schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"ingredient_name\": \"miso (red)\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"aka-miso\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/ingredient.schema.json",
        "required_fields": {
          "id": "ogs:core:ingredient:aka-miso",
          "ogs_version": "0.1.0",
          "category": "other"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:aka-miso\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\", \"ja\"],\n  \"name\": {\n    \"en\": \"miso (red)\",\n    \"ja\": \"赤味噌\"\n  },\n  \"description\": {\n    \"en\": \"Red miso (aka miso) is a deeply fermented soybean paste with a rich, robust, and intensely savory flavor profile. Fermented for longer periods than white or yellow miso, it develops a dark reddish-brown color and complex umami depth.\"\n  },\n  \"category\": \"other\",\n  \"subcategory\": \"fermented soybean paste\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 1.5,\n      \"sour\": 2.0,\n      \"salty\": 7.5,\n      \"bitter\": 2.5,\n      \"umami\": 9.0\n    },\n    \"structural\": {\n      \"body\": 8.0,\n      \"acidity_perception\": 2.0,\n      \"tannin\": 1.5,\n      \"astringency\": 2.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"savory\",\n        \"group\": \"fermented\",\n        \"item\": \"fermented soybean\",\n        \"qualifier\": \"fermented\",\n        \"intensity\": 9.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"soil\",\n        \"item\": \"earthy\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 5.0\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"pungent\",\n        \"item\": \"pungent\",\n        \"intensity\": 3.5\n      },\n      {\n        \"family\": \"dairy\",\n        \"group\": \"aged\",\n        \"item\": \"aged cheese\",\n        \"qualifier\": \"fermented\",\n        \"intensity\": 3.0\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"toasted\",\n        \"item\": \"toasted grain\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 4.0\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"caramel\",\n        \"item\": \"caramel\",\n        \"qualifier\": \"cooked\",\n        
\"intensity\": 2.0\n      }\n    ],\n    \"texture\": [\"thick\", \"paste-like\", \"dense\", \"granular\"],\n    \"finish_length\": 8.0,\n    \"complexity\": 8.5\n  },\n  \"origin\": {\n    \"region\": {\n      \"en\": \"Japan, particularly Aichi, Gifu, and Mie prefectures (Nagoya region)\",\n      \"ja\": \"日本、特に愛知県・岐阜県・三重県\"\n    },\n    \"season\": {\n      \"en\": \"available year-round; traditionally aged for 1–3 years\"\n    },\n    \"terroir_notes\": {\n      \"en\": \"Longer fermentation periods compared to shiro (white) or shinshu (yellow) miso; darker color and bolder flavor result from extended aging in cedar barrels.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Red miso is made primarily from soybeans with a smaller proportion of grain, fermented for a longer duration than lighter varieties. Its intense umami and salty profile make it suitable for hearty soups, braises, marinades, and glazes.\"\n  },\n  \"metadata\": {\n    \"created\": \"2024-06-01T00:00:00Z\",\n    \"updated\": \"2024-06-01T00:00:00Z\",\n    \"source\": \"OGS Core Vocabulary\",\n    \"license\": \"CC BY 4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:aka-miso",
                "expected": "ogs:core:ingredient:aka-miso",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "other",
                "expected": "other",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3066,
            "output_tokens": 976,
            "latency_ms": 12198.1,
            "usd": 0.023838,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:aka-miso\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\", \"ja\"],\n  \"name\": {\n    \"en\": \"miso (red)\",\n    \"ja\": \"赤味噌\"\n  },\n  \"description\": {\n    \"en\": \"A deeply fermented Japanese soybean paste aged for an extended period, yielding a rich reddish-brown color, bold umami depth, and pronounced savory and earthy complexity.\"\n  },\n  \"category\": \"other\",\n  \"subcategory\": \"fermented paste\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 1.5,\n      \"sour\": 2.5,\n      \"salty\": 8.0,\n      \"bitter\": 2.0,\n      \"umami\": 9.0\n    },\n    \"structural\": {\n      \"body\": 8.0,\n      \"acidity_perception\": 2.5,\n      \"tannin\": 1.5,\n      \"astringency\": 2.0,\n      \"alcohol_heat\": 0.5,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"savory\",\n        \"group\": \"fermented\",\n        \"item\": \"soy ferment\",\n        \"qualifier\": \"fermented\",\n        \"intensity\": 9.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"soil\",\n        \"item\": \"earthy depth\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"warm spice\",\n        \"item\": \"roasted grain\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"dairy\",\n        \"group\": \"aged\",\n        \"item\": \"aged umami note\",\n        \"qualifier\": \"fermented\",\n        \"intensity\": 3.5\n      },\n      {\n        \"family\": \"vegetal\",\n        \"group\": \"legume\",\n        \"item\": \"soybean\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 4.0\n      }\n    ],\n    \"texture\": [\n      \"thick\",\n      \"paste-like\",\n      \"dense\",\n      \"smooth to slightly granular\"\n    ],\n    \"finish_length\": 8.5,\n    
\"complexity\": 8.5\n  },\n  \"origin\": {\n    \"region\": {\n      \"en\": \"Japan (particularly Aichi, Gifu, and Mie prefectures)\",\n      \"ja\": \"日本（特に愛知・岐阜・三重県）\"\n    },\n    \"season\": {\n      \"en\": \"year-round; traditionally aged 1–3 years\"\n    },\n    \"terroir_notes\": {\n      \"en\": \"Extended aging produces a deep reddish-brown color and bold, complex flavors with high amino acid content.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Red miso (aka miso) is distinguished from white and mixed misos by its long fermentation period, which intensifies saltiness, umami, and aromatic complexity. It is a cornerstone of Nagoya cuisine and used in hearty soups, braises, and sauces.\"\n  },\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"updated\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"OGS Core Vocabulary\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:aka-miso",
                "expected": "ogs:core:ingredient:aka-miso",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "other",
                "expected": "other",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3066,
            "output_tokens": 931,
            "latency_ms": 11981.2,
            "usd": 0.023163,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:aka-miso\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"ja\",\n  \"available_languages\": [\"ja\", \"en\"],\n  \"name\": {\n    \"ja\": \"赤味噌\",\n    \"en\": \"miso (red)\"\n  },\n  \"description\": {\n    \"en\": \"Red miso (aka miso) is a deeply fermented soybean paste aged for one to three years, characterized by its dark reddish-brown color, robust umami, and pronounced salty-savory complexity.\",\n    \"ja\": \"赤味噌は長期間熟成された大豆由来の発酵調味料で、深い赤褐色と濃厚な旨味、塩味が特徴です。\"\n  },\n  \"category\": \"other\",\n  \"subcategory\": \"fermented paste\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 1.5,\n      \"sour\": 2.0,\n      \"salty\": 7.5,\n      \"bitter\": 2.5,\n      \"umami\": 9.0\n    },\n    \"structural\": {\n      \"body\": 8.0,\n      \"acidity_perception\": 2.0,\n      \"tannin\": 1.5,\n      \"astringency\": 2.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"savory\",\n        \"group\": \"fermented\",\n        \"item\": \"soy paste\",\n        \"qualifier\": \"fermented\",\n        \"intensity\": 9.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mold\",\n        \"item\": \"koji\",\n        \"qualifier\": \"fermented\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"caramel\",\n        \"item\": \"dark caramel\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 4.0\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"warm spice\",\n        \"item\": \"toasted grain\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 3.5\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"barrel\",\n        \"item\": \"aged wood\",\n        \"intensity\": 2.5\n      }\n    ],\n    \"texture\": [\n      \"thick\",\n      \"paste-like\",\n      \"dense\",\n      \"smooth to slightly grainy\"\n    ],\n    \"finish_length\": 
8.0,\n    \"complexity\": 8.5\n  },\n  \"origin\": {\n    \"region\": {\n      \"en\": \"Aichi, Nagoya, and central Japan\",\n      \"ja\": \"愛知県・名古屋を中心とした中部地方\"\n    },\n    \"season\": {\n      \"en\": \"year-round; traditionally fermented in summer and autumn\"\n    },\n    \"terroir_notes\": {\n      \"en\": \"Long fermentation in cedar barrels imparts depth and color; higher soybean-to-koji ratio contributes to boldness.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Red miso is essential to Nagoya cuisine (Nagoya-meshi), used in dishes such as miso katsu, miso nikomi udon, and dote-ni. Its intense flavor makes it suitable for braises, marinades, and glazes.\"\n  },\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"updated\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"OGS Core Ingredient Registry\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:aka-miso",
                "expected": "ogs:core:ingredient:aka-miso",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "other",
                "expected": "other",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3066,
            "output_tokens": 966,
            "latency_ms": 14471.4,
            "usd": 0.023688,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 3066.0,
        "output_tokens_mean": 957.667,
        "latency_ms_mean": 12883.567,
        "usd_sum": 0.070689
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-008",
      "category": "structured_output",
      "capability": "schema_valid_ingredient",
      "difficulty": "hard",
      "cuisine_region": "turkish",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.ingredient",
        "vocab.ingredient_categories",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "791ac1beebd95233",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS ingredient schema (`schema/ingredient.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/ingredient.schema.json\",\n  \"title\": \"OGS Ingredient\",\n  \"description\": \"An ingredient entity with sensory profile, category, and provenance metadata.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"category\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$\",\n      \"description\": \"OGS identifier in the format ogs:<namespace>:ingredient:<id>\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\",\n      \"description\": \"OGS specification version (0.1.x or 0.2.x)\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\",\n      \"description\": \"Authoritative / endonym language of this entity's human-readable fields.\"\n    
},\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      },\n      \"description\": \"Advisory list of languages covered by this entity's LString fields.\"\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"category\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"protein\",\n        \"vegetable\",\n        \"fruit\",\n        \"grain\",\n        \"dairy\",\n        \"fat\",\n        \"herb\",\n        \"spice\",\n        \"fungus\",\n        \"legume\",\n        \"nut\",\n        \"seed\",\n        \"sweetener\",\n        \"acid\",\n        \"salt\",\n        \"other\"\n      ]\n    },\n    \"subcategory\": {\n      \"type\": \"string\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"origin\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"region\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"season\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"terroir_notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          
\"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        
\"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    
\"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Ingredient categories\n\nprotein, vegetable, fruit, grain, dairy, fat, herb, spice, fungus, legume, nut, seed, sweetener, acid, salt, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic 
tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document that describes 'Urfa chili' as an OGS ingredient. It MUST validate against the OGS ingredient schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"ingredient_name\": \"Urfa chili\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"urfa-biber\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/ingredient.schema.json",
        "required_fields": {
          "id": "ogs:core:ingredient:urfa-biber",
          "ogs_version": "0.1.0",
          "category": "spice"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:urfa-biber\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\", \"tr\"],\n  \"name\": {\n    \"en\": \"Urfa chili\",\n    \"tr\": \"Urfa biberi\"\n  },\n  \"description\": {\n    \"en\": \"A dark, smoky dried chili pepper from the Urfa region of Turkey. After harvesting, the peppers undergo a unique curing process of sun-drying by day and wrapping by night, resulting in an oily, almost raisin-like texture and a complex, deeply smoky, fruity flavor with mild heat.\"\n  },\n  \"category\": \"spice\",\n  \"subcategory\": \"dried chili\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 4.5,\n      \"sour\": 1.0,\n      \"salty\": 1.5,\n      \"bitter\": 2.5,\n      \"umami\": 3.0\n    },\n    \"structural\": {\n      \"body\": 6.5,\n      \"acidity_perception\": 1.5,\n      \"astringency\": 2.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"spice\",\n        \"group\": \"chili\",\n        \"item\": \"dried chili\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 8.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"smoke\",\n        \"item\": \"wood smoke\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 7.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"dried fruit\",\n        \"item\": \"raisin\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"chocolate\",\n        \"item\": \"dark chocolate\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"stone fruit\",\n        \"item\": \"prune\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 4.0\n      }\n    ],\n    \"texture\": [\"oily\", \"moist\", \"flaky\"],\n    \"finish_length\": 7.0,\n    \"complexity\": 8.0\n  },\n  \"origin\": {\n    
\"region\": {\n      \"en\": \"Urfa (Şanlıurfa), southeastern Turkey\",\n      \"tr\": \"Şanlıurfa, Güneydoğu Türkiye\"\n    },\n    \"season\": {\n      \"en\": \"Harvested in late summer; cured through autumn\"\n    },\n    \"terroir_notes\": {\n      \"en\": \"The semi-arid climate and intense sun of the Şanlıurfa plateau, combined with the traditional day-sun / night-wrap curing method, impart the distinctive oily texture and complex smoky-fruity character unique to Urfa biber.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Also known as 'isot pepper'. The unique curing process retains natural oils, giving the flakes a slightly wet, glossy appearance. Heat level is mild to medium (approximately 30,000–50,000 Scoville units). Commonly used in Turkish cuisine, mezze, grilled meats, and increasingly in modern Western cooking.\"\n  },\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"updated\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"OGS Core Ingredient Registry\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:urfa-biber",
                "expected": "ogs:core:ingredient:urfa-biber",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "spice",
                "expected": "spice",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3068,
            "output_tokens": 943,
            "latency_ms": 14476.5,
            "usd": 0.023349,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:urfa-biber\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\", \"tr\"],\n  \"name\": {\n    \"en\": \"Urfa chili\",\n    \"tr\": \"Urfa biberi\"\n  },\n  \"description\": {\n    \"en\": \"Urfa biber (isot pepper) is a dried Turkish chili pepper from the Urfa region, processed through a unique sweating technique that gives it a deep burgundy-black color, oily texture, and complex flavor profile combining moderate heat with notes of chocolate, dried fruit, tobacco, and coffee.\"\n  },\n  \"category\": \"spice\",\n  \"subcategory\": \"dried chili pepper\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 4.5,\n      \"sour\": 1.0,\n      \"salty\": 1.5,\n      \"bitter\": 3.0,\n      \"umami\": 3.5\n    },\n    \"structural\": {\n      \"body\": 7.0,\n      \"acidity_perception\": 1.5,\n      \"astringency\": 2.5\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"spice\",\n        \"group\": \"chili\",\n        \"item\": \"dried chili\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 7.5\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"chocolate\",\n        \"item\": \"dark chocolate\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"dried fruit\",\n        \"item\": \"raisin\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"tobacco\",\n        \"item\": \"tobacco leaf\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 5.0\n      },\n      {\n        \"family\": \"savory\",\n        \"group\": \"roasted\",\n        \"item\": \"coffee\",\n        \"intensity\": 4.0\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"dried fruit\",\n        \"item\": \"dried plum\",\n        \"qualifier\": \"dried\",\n        
\"intensity\": 4.0\n      }\n    ],\n    \"texture\": [\n      \"oily\",\n      \"coarse\",\n      \"flaky\"\n    ],\n    \"finish_length\": 7.5,\n    \"complexity\": 8.0\n  },\n  \"origin\": {\n    \"region\": {\n      \"en\": \"Urfa (Şanlıurfa), southeastern Turkey\",\n      \"tr\": \"Şanlıurfa, Türkiye\"\n    },\n    \"season\": {\n      \"en\": \"Harvested in late summer, dried and processed in autumn\"\n    },\n    \"terroir_notes\": {\n      \"en\": \"Grown in the hot, arid climate of southeastern Anatolia. The unique double-processing method — sun-drying by day and sweating wrapped in cloth by night — develops the characteristic oiliness, dark color, and complex flavor.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Urfa biber contains natural oils that give it a moist, flaky texture unlike most dried chilies. Heat level is mild to moderate (approximately 30,000–50,000 Scoville units). Pairs exceptionally well with lamb, eggplant, chocolate, cheese, and red wine. Common in Turkish and Middle Eastern cuisines.\"\n  },\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"updated\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"OGS Core Vocabulary\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:urfa-biber",
                "expected": "ogs:core:ingredient:urfa-biber",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "spice",
                "expected": "spice",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3068,
            "output_tokens": 978,
            "latency_ms": 14771.3,
            "usd": 0.023874,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:urfa-biber\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\", \"tr\"],\n  \"name\": {\n    \"en\": \"Urfa chili\",\n    \"tr\": \"Urfa biberi\"\n  },\n  \"description\": {\n    \"en\": \"Urfa biber is a semi-dried Turkish chili pepper originating from the Urfa region of southeastern Turkey. It is dark brownish-red in color and prized for its complex, smoky, raisin-like flavor with moderate heat and an oily texture.\"\n  },\n  \"category\": \"spice\",\n  \"subcategory\": \"dried chili\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 4.5,\n      \"sour\": 1.5,\n      \"salty\": 1.0,\n      \"bitter\": 2.5,\n      \"umami\": 3.0\n    },\n    \"structural\": {\n      \"body\": 6.5,\n      \"acidity_perception\": 1.5,\n      \"astringency\": 2.5\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"spice\",\n        \"group\": \"chili\",\n        \"item\": \"ancho-like chili\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 8.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"smoke\",\n        \"item\": \"smoky\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"dried fruit\",\n        \"item\": \"raisin\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"chocolate\",\n        \"item\": \"dark chocolate\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"warm spice\",\n        \"item\": \"cumin\",\n        \"intensity\": 3.0\n      }\n    ],\n    \"texture\": [\"oily\", \"coarse\", \"slightly sticky\"],\n    \"finish_length\": 7.0,\n    \"complexity\": 8.0\n  },\n  \"origin\": {\n    \"region\": {\n      \"en\": \"Urfa (Şanlıurfa), southeastern Turkey\",\n      
\"tr\": \"Şanlıurfa, Güneydoğu Türkiye\"\n    },\n    \"season\": {\n      \"en\": \"Harvested in late summer; sun-dried and wrapped at night through autumn\"\n    },\n    \"terroir_notes\": {\n      \"en\": \"The unique day-night temperature swing of the Urfa plateau produces the characteristic oiliness and deep color. Peppers are sun-dried by day and wrapped in cloth at night, creating a semi-moist, fermented quality.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Also called isot pepper or urfa biber. The night-wrapping process gives the pepper a slightly fermented, chocolaty depth not found in other dried chilies. Heat level is moderate (Scoville ~30,000-50,000 SHU). Widely used in Turkish, Middle Eastern, and modern Western cuisine.\"\n  },\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"updated\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"OGS core editorial team\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:urfa-biber",
                "expected": "ogs:core:ingredient:urfa-biber",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "spice",
                "expected": "spice",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3068,
            "output_tokens": 921,
            "latency_ms": 15440.5,
            "usd": 0.023019,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 3068.0,
        "output_tokens_mean": 947.333,
        "latency_ms_mean": 14896.1,
        "usd_sum": 0.070242
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-009",
      "category": "structured_output",
      "capability": "schema_valid_ingredient",
      "difficulty": "easy",
      "cuisine_region": "american",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.ingredient",
        "vocab.ingredient_categories",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "a6a51cef64e43626",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS ingredient schema (`schema/ingredient.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/ingredient.schema.json\",\n  \"title\": \"OGS Ingredient\",\n  \"description\": \"An ingredient entity with sensory profile, category, and provenance metadata.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"category\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$\",\n      \"description\": \"OGS identifier in the format ogs:<namespace>:ingredient:<id>\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\",\n      \"description\": \"OGS specification version (0.1.x or 0.2.x)\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\",\n      \"description\": \"Authoritative / endonym language of this entity's human-readable fields.\"\n    
},\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      },\n      \"description\": \"Advisory list of languages covered by this entity's LString fields.\"\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"category\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"protein\",\n        \"vegetable\",\n        \"fruit\",\n        \"grain\",\n        \"dairy\",\n        \"fat\",\n        \"herb\",\n        \"spice\",\n        \"fungus\",\n        \"legume\",\n        \"nut\",\n        \"seed\",\n        \"sweetener\",\n        \"acid\",\n        \"salt\",\n        \"other\"\n      ]\n    },\n    \"subcategory\": {\n      \"type\": \"string\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"origin\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"region\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"season\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"terroir_notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          
\"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        
\"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    
\"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Ingredient categories\n\nprotein, vegetable, fruit, grain, dairy, fat, herb, spice, fungus, legume, nut, seed, sweetener, acid, salt, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic 
tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document that describes 'Meyer lemon' as an OGS ingredient. It MUST validate against the OGS ingredient schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"ingredient_name\": \"Meyer lemon\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"meyer-lemon\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/ingredient.schema.json",
        "required_fields": {
          "id": "ogs:core:ingredient:meyer-lemon",
          "ogs_version": "0.1.0",
          "category": "fruit"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:meyer-lemon\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\"],\n  \"name\": {\n    \"en\": \"Meyer lemon\"\n  },\n  \"description\": {\n    \"en\": \"Meyer lemon is a hybrid citrus fruit believed to be a cross between a true lemon and a mandarin orange or common orange. It is sweeter and less acidic than a standard lemon, with a thin, smooth, deep-yellow to orange-yellow skin and intensely floral, fruity aroma.\"\n  },\n  \"category\": \"fruit\",\n  \"subcategory\": \"citrus\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 6.5,\n      \"sour\": 6.0,\n      \"salty\": 0,\n      \"bitter\": 2.0,\n      \"umami\": 0\n    },\n    \"structural\": {\n      \"body\": 3.0,\n      \"acidity_perception\": 6.0,\n      \"tannin\": 0.5,\n      \"astringency\": 1.5,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"lemon\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 8.0\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"orange\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"white flower\",\n        \"item\": \"orange blossom\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"lemon zest\",\n        \"qualifier\": \"zest\",\n        \"intensity\": 7.5\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"sweet\",\n        \"item\": \"lemon curd\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 3.5\n      }\n    ],\n    \"texture\": [\n      \"juicy\",\n      \"tender\",\n      \"smooth rind\",\n      
\"thin-skinned\"\n    ],\n    \"finish_length\": 5.5,\n    \"complexity\": 6.0\n  },\n  \"origin\": {\n    \"region\": {\n      \"en\": \"China; widely cultivated in California and Mediterranean regions\"\n    },\n    \"season\": {\n      \"en\": \"Late fall through early spring (November to April)\"\n    },\n    \"terroir_notes\": {\n      \"en\": \"Warm, frost-free climates enhance sweetness; coastal California conditions produce particularly aromatic fruit.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Meyer lemons are prized in culinary applications for their sweeter, more complex flavor compared to Eureka or Lisbon lemons. They are commonly used in desserts, preserves, vinaigrettes, and cocktails.\"\n  },\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"updated\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"OGS Core Ingredient Registry\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:meyer-lemon",
                "expected": "ogs:core:ingredient:meyer-lemon",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "fruit",
                "expected": "fruit",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3063,
            "output_tokens": 886,
            "latency_ms": 12997.3,
            "usd": 0.022479,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:meyer-lemon\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\"],\n  \"name\": {\n    \"en\": \"Meyer lemon\"\n  },\n  \"description\": {\n    \"en\": \"A citrus fruit thought to be a cross between a true lemon and a mandarin orange or common orange. Meyer lemons are smaller and rounder than true lemons, with a thin, deep-yellow to orange skin and sweeter, less acidic juice.\"\n  },\n  \"category\": \"fruit\",\n  \"subcategory\": \"citrus\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 6.5,\n      \"sour\": 6.0,\n      \"salty\": 0,\n      \"bitter\": 2.0,\n      \"umami\": 0\n    },\n    \"structural\": {\n      \"body\": 3.0,\n      \"acidity_perception\": 6.0,\n      \"tannin\": 0.5,\n      \"astringency\": 1.5,\n      \"alcohol_heat\": 0,\n      \"effervescence\": 0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"lemon\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 8.0\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"orange blossom\",\n        \"item\": \"orange blossom\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"mandarin\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"lemon zest\",\n        \"qualifier\": \"zest\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"sweet\",\n        \"item\": \"lemon curd\",\n        \"intensity\": 3.5\n      }\n    ],\n    \"texture\": [\n      \"juicy\",\n      \"smooth\",\n      \"thin-skinned\"\n    ],\n    \"finish_length\": 5.0,\n    \"complexity\": 5.5\n  },\n  \"origin\": 
{\n    \"region\": {\n      \"en\": \"Originally from China; widely cultivated in California, USA\"\n    },\n    \"season\": {\n      \"en\": \"November through March, with peak in winter\"\n    },\n    \"terroir_notes\": {\n      \"en\": \"Meyer lemons grown in coastal California tend to develop a particularly floral, sweet character due to mild temperatures and foggy conditions.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Meyer lemons are prized for their sweeter, less astringent juice compared to Eureka or Lisbon lemons. The entire fruit, including the thin skin, is edible and commonly used in both sweet and savory preparations.\"\n  },\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"updated\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"OGS Core Ingredient Catalog\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:meyer-lemon",
                "expected": "ogs:core:ingredient:meyer-lemon",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "fruit",
                "expected": "fruit",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3063,
            "output_tokens": 868,
            "latency_ms": 12353.4,
            "usd": 0.022209,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:meyer-lemon\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\"],\n  \"name\": {\n    \"en\": \"Meyer lemon\"\n  },\n  \"description\": {\n    \"en\": \"Meyer lemon is a citrus fruit hybrid between a true lemon and a mandarin orange. It is sweeter and less acidic than a standard lemon, with thin, fragrant, deep-yellow to orange skin and floral, complex juice.\"\n  },\n  \"category\": \"fruit\",\n  \"subcategory\": \"citrus\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 5.5,\n      \"sour\": 6.5,\n      \"salty\": 0,\n      \"bitter\": 2.0,\n      \"umami\": 0\n    },\n    \"structural\": {\n      \"body\": 3.0,\n      \"acidity_perception\": 6.5,\n      \"tannin\": 0.5,\n      \"astringency\": 1.5\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"lemon\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 8.0\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"blossom\",\n        \"item\": \"orange blossom\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"mandarin\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 4.0\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"warm spice\",\n        \"item\": \"vanilla\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 2.0\n      }\n    ],\n    \"texture\": [\n      \"juicy\",\n      \"smooth\",\n      \"thin-skinned\"\n    ],\n    \"finish_length\": 5.0,\n    \"complexity\": 6.0\n  },\n  \"origin\": {\n    \"region\": {\n      \"en\": \"China; widely cultivated in California, USA\"\n    },\n    \"season\": {\n      \"en\": \"Late autumn through early spring (November to April)\"\n    },\n    \"terroir_notes\": {\n      
\"en\": \"Warm, Mediterranean climates bring out floral and sweet notes; cooler coastal regions tend to intensify acidity.\"\n    }\n  },\n  \"notes\": {\n    \"en\": \"Meyer lemons are prized in culinary applications for their sweeter, more nuanced flavor compared to Eureka or Lisbon lemons. Both the juice and zest are used extensively in desserts, dressings, and cocktails.\"\n  },\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"updated\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"OGS Core Ingredient Catalog\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:meyer-lemon",
                "expected": "ogs:core:ingredient:meyer-lemon",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "fruit",
                "expected": "fruit",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3063,
            "output_tokens": 799,
            "latency_ms": 12477.2,
            "usd": 0.021174,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 3063.0,
        "output_tokens_mean": 851.0,
        "latency_ms_mean": 12609.3,
        "usd_sum": 0.065862
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-010",
      "category": "structured_output",
      "capability": "schema_valid_ingredient",
      "difficulty": "hard",
      "cuisine_region": "mexican",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.ingredient",
        "vocab.ingredient_categories",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "6d4d6d3cc23f3283",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS ingredient schema (`schema/ingredient.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/ingredient.schema.json\",\n  \"title\": \"OGS Ingredient\",\n  \"description\": \"An ingredient entity with sensory profile, category, and provenance metadata.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"category\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$\",\n      \"description\": \"OGS identifier in the format ogs:<namespace>:ingredient:<id>\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\",\n      \"description\": \"OGS specification version (0.1.x or 0.2.x)\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\",\n      \"description\": \"Authoritative / endonym language of this entity's human-readable fields.\"\n    
},\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      },\n      \"description\": \"Advisory list of languages covered by this entity's LString fields.\"\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"category\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"protein\",\n        \"vegetable\",\n        \"fruit\",\n        \"grain\",\n        \"dairy\",\n        \"fat\",\n        \"herb\",\n        \"spice\",\n        \"fungus\",\n        \"legume\",\n        \"nut\",\n        \"seed\",\n        \"sweetener\",\n        \"acid\",\n        \"salt\",\n        \"other\"\n      ]\n    },\n    \"subcategory\": {\n      \"type\": \"string\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"origin\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"region\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"season\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"terroir_notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          
\"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        
\"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    
\"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Ingredient categories\n\nprotein, vegetable, fruit, grain, dairy, fat, herb, spice, fungus, legume, nut, seed, sweetener, acid, salt, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic 
tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document that describes 'wild huitlacoche' as an OGS ingredient. It MUST validate against the OGS ingredient schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"ingredient_name\": \"wild huitlacoche\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"huitlacoche\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/ingredient.schema.json",
        "required_fields": {
          "id": "ogs:core:ingredient:huitlacoche",
          "ogs_version": "0.1.0",
          "category": "fungus"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:huitlacoche\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"es\",\n  \"available_languages\": [\"es\", \"en\"],\n  \"name\": {\n    \"es\": \"huitlacoche silvestre\",\n    \"en\": \"wild huitlacoche\"\n  },\n  \"description\": {\n    \"es\": \"Hongo parásito del maíz (Ustilago maydis) recolectado en estado silvestre, considerado un manjar en la cocina mexicana por su sabor terroso, ahumado y umami.\",\n    \"en\": \"Wild-harvested corn smut fungus (Ustilago maydis), prized in Mexican cuisine for its earthy, smoky, and deeply umami flavor profile.\"\n  },\n  \"category\": \"fungus\",\n  \"subcategory\": \"corn smut\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 2.0,\n      \"sour\": 0.5,\n      \"salty\": 1.0,\n      \"bitter\": 1.5,\n      \"umami\": 8.0\n    },\n    \"structural\": {\n      \"body\": 7.0,\n      \"acidity_perception\": 1.0,\n      \"tannin\": 0.5,\n      \"astringency\": 1.5\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"earth\",\n        \"group\": \"fungal\",\n        \"item\": \"mushroom\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 8.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"soil\",\n        \"item\": \"petrichor\",\n        \"intensity\": 5.0\n      },\n      {\n        \"family\": \"savory\",\n        \"group\": \"umami\",\n        \"item\": \"corn-truffle\",\n        \"intensity\": 7.5\n      },\n      {\n        \"family\": \"vegetal\",\n        \"group\": \"corn\",\n        \"item\": \"roasted corn\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"smoky\",\n        \"item\": \"smoke\",\n        \"intensity\": 3.5\n      }\n    ],\n    \"texture\": [\n      \"soft\",\n      \"creamy\",\n      \"gelatinous\",\n      \"slightly chewy\"\n    ],\n    \"finish_length\": 7.0,\n    \"complexity\": 
8.0\n  },\n  \"origin\": {\n    \"region\": {\n      \"es\": \"México y América Central\",\n      \"en\": \"Mexico and Central America\"\n    },\n    \"season\": {\n      \"es\": \"temporada de lluvias, verano\",\n      \"en\": \"rainy season, summer\"\n    },\n    \"terroir_notes\": {\n      \"es\": \"Crece de forma silvestre en milpas y campos de maíz, especialmente tras las lluvias estivales.\",\n      \"en\": \"Grows wild on milpa and maize fields, particularly after summer rains.\"\n    }\n  },\n  \"notes\": {\n    \"es\": \"Conocido también como cuitlacoche o maíz enfermo; considerado el 'trufa mexicana'. Utilizado en quesadillas, salsas, sopas y tamales.\",\n    \"en\": \"Also known as cuitlacoche or corn mushroom; often called the 'Mexican truffle'. Used in quesadillas, salsas, soups, and tamales.\"\n  },\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"updated\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"OGS Core Vocabulary\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:huitlacoche",
                "expected": "ogs:core:ingredient:huitlacoche",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "fungus",
                "expected": "fungus",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3070,
            "output_tokens": 985,
            "latency_ms": 14384.8,
            "usd": 0.023985,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:huitlacoche\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"es\",\n  \"available_languages\": [\"es\", \"en\"],\n  \"name\": {\n    \"es\": \"huitlacoche silvestre\",\n    \"en\": \"wild huitlacoche\"\n  },\n  \"description\": {\n    \"es\": \"Hongo parásito del maíz (Ustilago maydis) que crece de forma silvestre, apreciado en la cocina mexicana por su sabor terroso y umami.\",\n    \"en\": \"A wild corn fungus (Ustilago maydis) that parasitizes maize ears, prized in Mexican cuisine for its earthy, umami-rich flavor.\"\n  },\n  \"category\": \"fungus\",\n  \"subcategory\": \"corn smut\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 2.5,\n      \"sour\": 0.5,\n      \"salty\": 0.5,\n      \"bitter\": 1.5,\n      \"umami\": 8.0\n    },\n    \"structural\": {\n      \"body\": 7.0,\n      \"acidity_perception\": 1.0,\n      \"tannin\": 0.5,\n      \"astringency\": 1.5\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"earth\",\n        \"group\": \"mushroom\",\n        \"item\": \"truffle-like earthiness\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 8.5\n      },\n      {\n        \"family\": \"savory\",\n        \"group\": \"umami\",\n        \"item\": \"corn broth\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 7.5\n      },\n      {\n        \"family\": \"vegetal\",\n        \"group\": \"corn\",\n        \"item\": \"roasted corn\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"soil\",\n        \"item\": \"loamy earth\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"warm spice\",\n        \"item\": \"black pepper\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 2.0\n      }\n    ],\n    \"texture\": [\n      \"soft\",\n      
\"gelatinous\",\n      \"tender\",\n      \"slightly chewy\"\n    ],\n    \"finish_length\": 7.0,\n    \"complexity\": 8.0\n  },\n  \"origin\": {\n    \"region\": {\n      \"es\": \"México y Mesoamérica\",\n      \"en\": \"Mexico and Mesoamerica\"\n    },\n    \"season\": {\n      \"es\": \"temporada de lluvias (verano)\",\n      \"en\": \"rainy season (summer)\"\n    },\n    \"terroir_notes\": {\n      \"es\": \"Crece de forma silvestre en milpas y campos de maíz, influenciado por la variedad de maíz local y las condiciones de humedad.\",\n      \"en\": \"Grows wild on milpa and cornfields; flavor influenced by local maize variety and moisture conditions.\"\n    }\n  },\n  \"notes\": {\n    \"es\": \"Conocido también como cuitlacoche; considerado un manjar en México. Las esporas maduras tienen una coloración negra-grisácea característica.\",\n    \"en\": \"Also known as cuitlacoche; considered a delicacy in Mexico. Mature spores display a characteristic black-grey coloration.\"\n  },\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"updated\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"OGS Core Ingredient Registry\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:ingredient:huitlacoche",
                "expected": "ogs:core:ingredient:huitlacoche",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "category": {
                "predicted": "fungus",
                "expected": "fungus",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3070,
            "output_tokens": 1010,
            "latency_ms": 15060.6,
            "usd": 0.02436,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:ingredient:huitlacoche\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"es\",\n  \"available_languages\": [\"es\", \"en\"],\n  \"name\": {\n    \"es\": \"huitlacoche silvestre\",\n    \"en\": \"wild huitlacoche\"\n  },\n  \"description\": {\n    \"es\": \"Hongo parásito del maíz (Ustilago maydis) que crece de forma silvestre, apreciado en la cocina mexicana por su sabor terroso y umami.\",\n    \"en\": \"Wild corn smut fungus (Ustilago maydis) that grows parasitically on maize ears, prized in Mexican cuisine for its earthy, umami-rich flavor.\"\n  },\n  \"category\": \"fungus\",\n  \"subcategory\": \"corn smut\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 2.0,\n      \"sour\": 0.5,\n      \"salty\": 1.0,\n      \"bitter\": 1.5,\n      \"umami\": 8.0\n    },\n    \"structural\": {\n      \"body\": 7.0,\n      \"acidity_perception\": 1.0,\n      \"tannin\": 0.5,\n      \"astringency\": 1.0,\n      \"alcohol_heat\": 0.0,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"earth\",\n        \"group\": \"fungal\",\n        \"item\": \"mushroom\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 8.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"soil\",\n        \"item\": \"loam\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"savory\",\n        \"group\": \"umami\",\n        \"item\": \"corn\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"vegetal\",\n        \"group\": \"grain\",\n        \"item\": \"sweet corn\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 5.0\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"dark spice\",\n        \"item\": \"black truffle\",\n        \"intensity\": 4.0\n      }\n    ],\n    \"texture\": [\n      \"creamy\",\n      \"soft\",\n      \"slightly 
gelatinous\",\n      \"moist\"\n    ],\n    \"finish_length\": 7.0,\n    \"complexity\": 8.0\n  },\n  \"origin\": {\n    \"region\": {\n      \"es\": \"México y Mesoamérica\",\n      \"en\": \"Mexico and Mesoamerica\"\n    },\n    \"season\": {\n      \"es\": \"verano, temporada de lluvias (julio–septiembre)\",\n      \"en\": \"summer, rainy season (July–September)\"\n    },\n    \"terroir_notes\": {\n      \"es\": \"Crece principalmente en campos de maíz nativo, con mejor calidad en regiones de altitud media del centro de México.\",\n      \"en\": \"Grows primarily on native maize fields; best quality found in mid-altitude regions of central Mexico.\"\n    }\n  },\n  \"notes\": {\n    \"es\": \"El huitlacoche silvestre es considerado una delicatessen en la cocina mexicana. Se usa en quesadillas, sopas, tamales y salsas. Conocido también como 'trufa del maíz'.\",\n    \"en\": \"Wild huitlacoche is considered a delicacy in Mexican cuisine. Used in quesadillas, soups, tamales, and sauces. Also known as 'corn truffle' or 'Mexican truffle'.\"\n  },\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"updated\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "['es', 'en'] is not of type 'object'"
            ],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": null,
                "expected": "ogs:core:ingredient:huitlacoche",
                "match": false
              },
              "ogs_version": {
                "predicted": null,
                "expected": "0.1.0",
                "match": false
              },
              "category": {
                "predicted": null,
                "expected": "fungus",
                "match": false
              }
            }
          },
          "metrics": {
            "input_tokens": 3070,
            "output_tokens": 1024,
            "latency_ms": 14439.0,
            "usd": 0.02457,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "max_tokens"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.5774,
        "sem": 0.3333,
        "ci95_lo": 0.0133,
        "ci95_hi": 1.32,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 3070.0,
        "output_tokens_mean": 1006.333,
        "latency_ms_mean": 14628.133,
        "usd_sum": 0.072915
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-011",
      "category": "structured_output",
      "capability": "schema_valid_beverage",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.beverage",
        "vocab.beverage_types",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "6615664480900680",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS beverage schema (`schema/beverage.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/beverage.schema.json\",\n  \"title\": \"OGS Beverage\",\n  \"description\": \"A beverage entity with type, structural attributes, sensory profile, and provenance.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"type\",\n    \"sensory_profile\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:beverage:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": 
\"#/$defs/lstring\"\n    },\n    \"type\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"wine\",\n        \"beer\",\n        \"spirit\",\n        \"cider\",\n        \"sake\",\n        \"other\"\n      ]\n    },\n    \"subtype\": {\n      \"type\": \"string\"\n    },\n    \"grape_varieties\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"type\": \"string\"\n      }\n    },\n    \"vintage\": {\n      \"type\": \"integer\"\n    },\n    \"region\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"appellation\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"producer\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"abv\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 100\n    },\n    \"residual_sugar\": {\n      \"type\": \"number\",\n      \"minimum\": 0\n    },\n    \"serve_temperature_c\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"min\": {\n          \"type\": \"number\"\n        },\n        \"max\": {\n          \"type\": \"number\"\n        }\n      },\n      \"additionalProperties\": false,\n      \"description\": \"Recommended serving temperature range in degrees Celsius (SI canonical).\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n  
        \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        
\"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    
\"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Beverage types\n\nwine, beer, spirit, cider, sake, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is 
rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document describing 'Barolo' as an OGS beverage. It MUST validate against the OGS beverage schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"beverage_name\": \"Barolo\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"barolo-docg\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/beverage.schema.json",
        "required_fields": {
          "id": "ogs:core:beverage:barolo-docg",
          "ogs_version": "0.1.0",
          "type": "wine"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:barolo-docg\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\", \"it\"],\n  \"name\": {\n    \"en\": \"Barolo\",\n    \"it\": \"Barolo\"\n  },\n  \"description\": {\n    \"en\": \"Barolo is a prestigious red wine produced in the Langhe hills of Piedmont, Italy, from the Nebbiolo grape. Often called the 'King of Italian Wines', it is known for its powerful tannins, high acidity, complex aromatics of roses, tar, and dried fruit, and exceptional aging potential.\",\n    \"it\": \"Il Barolo è un prestigioso vino rosso prodotto nelle Langhe, in Piemonte, Italia, dal vitigno Nebbiolo. Spesso chiamato 'il Re dei vini italiani', è noto per i suoi tannini potenti, l'elevata acidità e la complessità aromatica.\"\n  },\n  \"type\": \"wine\",\n  \"subtype\": \"red\",\n  \"grape_varieties\": [\"Nebbiolo\"],\n  \"region\": {\n    \"en\": \"Langhe, Piedmont, Italy\",\n    \"it\": \"Langhe, Piemonte, Italia\"\n  },\n  \"appellation\": {\n    \"en\": \"Barolo DOCG\",\n    \"it\": \"Barolo DOCG\"\n  },\n  \"abv\": 14.0,\n  \"residual_sugar\": 2.0,\n  \"serve_temperature_c\": {\n    \"min\": 16,\n    \"max\": 18\n  },\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 2.0,\n      \"sour\": 7.5,\n      \"salty\": 0.5,\n      \"bitter\": 5.5,\n      \"umami\": 2.5\n    },\n    \"structural\": {\n      \"body\": 8.5,\n      \"acidity_perception\": 7.5,\n      \"tannin\": 9.0,\n      \"astringency\": 8.0,\n      \"alcohol_heat\": 6.5,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"floral\",\n        \"group\": \"rose\",\n        \"item\": \"dried rose\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 8.0\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"red fruit\",\n        \"item\": \"cherry\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 7.5\n      },\n      {\n    
    \"family\": \"fruit\",\n        \"group\": \"dark fruit\",\n        \"item\": \"plum\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"tar\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"warm spice\",\n        \"item\": \"licorice\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"forest floor\",\n        \"item\": \"truffle\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"oak\",\n        \"item\": \"vanilla\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"oak\",\n        \"item\": \"cedar\",\n        \"intensity\": 5.0\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"aromatic\",\n        \"item\": \"clove\",\n        \"intensity\": 4.0\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"dried herb\",\n        \"item\": \"tobacco\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 5.0\n      }\n    ],\n    \"texture",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "['en', 'it'] is not of type 'object'"
            ],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": null,
                "expected": "ogs:core:beverage:barolo-docg",
                "match": false
              },
              "ogs_version": {
                "predicted": null,
                "expected": "0.1.0",
                "match": false
              },
              "type": {
                "predicted": null,
                "expected": "wine",
                "match": false
              }
            }
          },
          "metrics": {
            "input_tokens": 3083,
            "output_tokens": 1024,
            "latency_ms": 13568.4,
            "usd": 0.024609,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "max_tokens"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:barolo-docg\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\", \"it\"],\n  \"name\": {\n    \"en\": \"Barolo\",\n    \"it\": \"Barolo\"\n  },\n  \"description\": {\n    \"en\": \"Barolo is a prestigious red wine from the Langhe hills of Piedmont, Italy, produced from the Nebbiolo grape. Often called the 'King of Italian Wines', it is renowned for its power, complexity, and exceptional aging potential.\",\n    \"it\": \"Il Barolo è un vino rosso prestigioso prodotto nelle colline delle Langhe in Piemonte, Italia, dalla varietà Nebbiolo. Spesso chiamato il 'Re dei Vini Italiani'.\"\n  },\n  \"type\": \"wine\",\n  \"subtype\": \"red\",\n  \"grape_varieties\": [\"Nebbiolo\"],\n  \"region\": {\n    \"en\": \"Piedmont, Italy\",\n    \"it\": \"Piemonte, Italia\"\n  },\n  \"appellation\": {\n    \"en\": \"Barolo DOCG\",\n    \"it\": \"Barolo DOCG\"\n  },\n  \"abv\": 14.0,\n  \"residual_sugar\": 2.0,\n  \"serve_temperature_c\": {\n    \"min\": 16,\n    \"max\": 18\n  },\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 2.0,\n      \"sour\": 5.5,\n      \"salty\": 1.0,\n      \"bitter\": 4.5,\n      \"umami\": 2.5\n    },\n    \"structural\": {\n      \"body\": 8.5,\n      \"acidity_perception\": 7.5,\n      \"tannin\": 8.5,\n      \"astringency\": 7.0,\n      \"alcohol_heat\": 6.5,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"red fruit\",\n        \"item\": \"cherry\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 7.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"red fruit\",\n        \"item\": \"plum\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"dried flower\",\n        \"item\": \"rose\",\n        \"qualifier\": \"dried\",\n        
\"intensity\": 7.0\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"dried flower\",\n        \"item\": \"violet\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"tar\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"forest floor\",\n        \"item\": \"truffle\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"warm spice\",\n        \"item\": \"clove\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 5.0\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"warm spice\",\n        \"item\": \"cinnamon\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"oak\",\n        \"item\": \"vanilla\",\n        \"intensity\": 4.0\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"oak\",\n        \"item\": \"cedar\",\n        \"intensity\": 5.0\n      },\n      {\n        \"family\": \"savory\",\n        \"group\": \"leather\",\n        \"item\": \"leather\",\n        \"intensity\": 5.5",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "['en', 'it'] is not of type 'object'"
            ],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": null,
                "expected": "ogs:core:beverage:barolo-docg",
                "match": false
              },
              "ogs_version": {
                "predicted": null,
                "expected": "0.1.0",
                "match": false
              },
              "type": {
                "predicted": null,
                "expected": "wine",
                "match": false
              }
            }
          },
          "metrics": {
            "input_tokens": 3083,
            "output_tokens": 1023,
            "latency_ms": 12775.7,
            "usd": 0.024594,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "max_tokens"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:barolo-docg\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\", \"it\"],\n  \"name\": {\n    \"en\": \"Barolo\",\n    \"it\": \"Barolo\"\n  },\n  \"description\": {\n    \"en\": \"Barolo is a prestigious Italian red wine produced in the Langhe hills of Piedmont, Italy, from the Nebbiolo grape. Known as the 'King of Wines', it is celebrated for its power, complexity, and exceptional aging potential.\",\n    \"it\": \"Il Barolo è un prestigioso vino rosso italiano prodotto nelle colline delle Langhe in Piemonte, ottenuto dal vitigno Nebbiolo. Conosciuto come il 'Re dei Vini', è celebrato per la sua potenza, complessità e straordinaria capacità di invecchiamento.\"\n  },\n  \"type\": \"wine\",\n  \"subtype\": \"red\",\n  \"grape_varieties\": [\"Nebbiolo\"],\n  \"region\": {\n    \"en\": \"Langhe, Piedmont, Italy\",\n    \"it\": \"Langhe, Piemonte, Italia\"\n  },\n  \"appellation\": {\n    \"en\": \"Barolo DOCG\",\n    \"it\": \"Barolo DOCG\"\n  },\n  \"abv\": 14.0,\n  \"residual_sugar\": 2.5,\n  \"serve_temperature_c\": {\n    \"min\": 16,\n    \"max\": 18\n  },\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 2.0,\n      \"sour\": 5.5,\n      \"salty\": 0.5,\n      \"bitter\": 4.0,\n      \"umami\": 2.5\n    },\n    \"structural\": {\n      \"body\": 8.5,\n      \"acidity_perception\": 7.5,\n      \"tannin\": 8.5,\n      \"astringency\": 7.5,\n      \"alcohol_heat\": 6.5,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"dark fruit\",\n        \"item\": \"cherry\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 7.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"dark fruit\",\n        \"item\": \"plum\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"fruit\",\n        
\"group\": \"red fruit\",\n        \"item\": \"raspberry\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 5.0\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"dried flower\",\n        \"item\": \"rose\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"dried flower\",\n        \"item\": \"violet\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"tar\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"soil\",\n        \"item\": \"truffle\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"iron\",\n        \"intensity\": 5.0\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"warm spice\",\n        \"item\": \"cinnamon\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"warm spice\",\n        \"item\": \"clove\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 4.5",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "['en', 'it'] is not of type 'object'"
            ],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": null,
                "expected": "ogs:core:beverage:barolo-docg",
                "match": false
              },
              "ogs_version": {
                "predicted": null,
                "expected": "0.1.0",
                "match": false
              },
              "type": {
                "predicted": null,
                "expected": "wine",
                "match": false
              }
            }
          },
          "metrics": {
            "input_tokens": 3083,
            "output_tokens": 1024,
            "latency_ms": 12573.1,
            "usd": 0.024609,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "max_tokens"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 3083.0,
        "output_tokens_mean": 1023.667,
        "latency_ms_mean": 12972.4,
        "usd_sum": 0.073812
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-012",
      "category": "structured_output",
      "capability": "schema_valid_beverage",
      "difficulty": "hard",
      "cuisine_region": "scottish",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.beverage",
        "vocab.beverage_types",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "bd330891a3e68b59",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS beverage schema (`schema/beverage.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/beverage.schema.json\",\n  \"title\": \"OGS Beverage\",\n  \"description\": \"A beverage entity with type, structural attributes, sensory profile, and provenance.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"type\",\n    \"sensory_profile\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:beverage:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": 
\"#/$defs/lstring\"\n    },\n    \"type\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"wine\",\n        \"beer\",\n        \"spirit\",\n        \"cider\",\n        \"sake\",\n        \"other\"\n      ]\n    },\n    \"subtype\": {\n      \"type\": \"string\"\n    },\n    \"grape_varieties\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"type\": \"string\"\n      }\n    },\n    \"vintage\": {\n      \"type\": \"integer\"\n    },\n    \"region\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"appellation\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"producer\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"abv\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 100\n    },\n    \"residual_sugar\": {\n      \"type\": \"number\",\n      \"minimum\": 0\n    },\n    \"serve_temperature_c\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"min\": {\n          \"type\": \"number\"\n        },\n        \"max\": {\n          \"type\": \"number\"\n        }\n      },\n      \"additionalProperties\": false,\n      \"description\": \"Recommended serving temperature range in degrees Celsius (SI canonical).\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n  
        \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        
\"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    
\"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Beverage types\n\nwine, beer, spirit, cider, sake, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is 
rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document describing 'Islay single malt' as an OGS beverage. It MUST validate against the OGS beverage schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"beverage_name\": \"Islay single malt\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"ardbeg-10-islay\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/beverage.schema.json",
        "required_fields": {
          "id": "ogs:core:beverage:ardbeg-10-islay",
          "ogs_version": "0.1.0",
          "type": "spirit"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:ardbeg-10-islay\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Islay single malt\",\n  \"type\": \"spirit\",\n  \"subtype\": \"single malt Scotch whisky\",\n  \"region\": \"Islay, Scotland\",\n  \"abv\": 46.0,\n  \"serve_temperature_c\": {\n    \"min\": 15,\n    \"max\": 20\n  },\n  \"description\": \"A classic Islay single malt Scotch whisky, renowned for its intense peat smoke, maritime character, and medicinal depth. Produced on the island of Islay off the west coast of Scotland.\",\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 3.5,\n      \"sour\": 2.0,\n      \"salty\": 2.5,\n      \"bitter\": 4.0,\n      \"umami\": 1.5\n    },\n    \"structural\": {\n      \"body\": 7.0,\n      \"acidity_perception\": 2.5,\n      \"tannin\": 3.0,\n      \"astringency\": 2.5,\n      \"alcohol_heat\": 7.5,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"earth\",\n        \"group\": \"peat\",\n        \"item\": \"smoke\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 9.0\n      },\n      {\n        \"family\": \"marine\",\n        \"group\": \"seaweed\",\n        \"item\": \"brine\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 7.5\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"medicinal\",\n        \"item\": \"iodine\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"stone fruit\",\n        \"item\": \"peach\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 4.0\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"warm spice\",\n        \"item\": \"black pepper\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"oak\",\n        \"item\": \"vanilla\",\n        \"intensity\": 3.5\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"sweet\",\n  
      \"item\": \"dark chocolate\",\n        \"intensity\": 3.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"mineral\",\n        \"item\": \"ash\",\n        \"intensity\": 6.0\n      }\n    ],\n    \"texture\": [\n      \"oily\",\n      \"warming\",\n      \"full-bodied\"\n    ],\n    \"finish_length\": 9.0,\n    \"complexity\": 8.5\n  },\n  \"metadata\": {\n    \"source\": \"OGS Core Benchmark\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:ardbeg-10-islay",
                "expected": "ogs:core:beverage:ardbeg-10-islay",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "spirit",
                "expected": "spirit",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3092,
            "output_tokens": 786,
            "latency_ms": 11034.7,
            "usd": 0.021066,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:ardbeg-10-islay\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Islay single malt\",\n  \"type\": \"spirit\",\n  \"subtype\": \"single malt Scotch whisky\",\n  \"region\": \"Islay, Scotland\",\n  \"abv\": 46.0,\n  \"serve_temperature_c\": {\n    \"min\": 15,\n    \"max\": 20\n  },\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 3.5,\n      \"sour\": 1.5,\n      \"salty\": 2.0,\n      \"bitter\": 4.0,\n      \"umami\": 1.5\n    },\n    \"structural\": {\n      \"body\": 7.0,\n      \"acidity_perception\": 2.5,\n      \"tannin\": 2.0,\n      \"astringency\": 2.5,\n      \"alcohol_heat\": 7.5,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"earth\",\n        \"group\": \"peat\",\n        \"item\": \"smoke\",\n        \"intensity\": 9.0\n      },\n      {\n        \"family\": \"marine\",\n        \"group\": \"seaweed\",\n        \"item\": \"brine\",\n        \"intensity\": 7.5\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"medicinal\",\n        \"item\": \"iodine\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"oak\",\n        \"item\": \"vanilla\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"lemon\",\n        \"qualifier\": \"zest\",\n        \"intensity\": 3.5\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"sweet\",\n        \"item\": \"dark chocolate\",\n        \"intensity\": 3.0\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"green\",\n        \"item\": \"mint\",\n        \"intensity\": 2.5\n      }\n    ],\n    \"texture\": [\n      \"oily\",\n      \"warming\",\n      \"medium-full\"\n    ],\n    \"finish_length\": 8.5,\n    \"complexity\": 8.5\n  },\n  \"notes\": \"Classic Islay single malt characterised by intense 
peat smoke, maritime brine, and medicinal iodine notes, balanced by underlying sweetness and oak influence.\",\n  \"metadata\": {\n    \"source\": \"OGS core benchmark\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:ardbeg-10-islay",
                "expected": "ogs:core:beverage:ardbeg-10-islay",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "spirit",
                "expected": "spirit",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3092,
            "output_tokens": 717,
            "latency_ms": 10313.3,
            "usd": 0.020031,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:ardbeg-10-islay\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Islay single malt\",\n  \"type\": \"spirit\",\n  \"subtype\": \"single malt Scotch whisky\",\n  \"region\": \"Islay, Scotland\",\n  \"abv\": 46.0,\n  \"serve_temperature_c\": {\n    \"min\": 15,\n    \"max\": 20\n  },\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 4.5,\n      \"sour\": 1.5,\n      \"salty\": 2.0,\n      \"bitter\": 3.5,\n      \"umami\": 1.0\n    },\n    \"structural\": {\n      \"body\": 7.0,\n      \"acidity_perception\": 2.0,\n      \"tannin\": 2.5,\n      \"astringency\": 2.0,\n      \"alcohol_heat\": 7.5,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"earth\",\n        \"group\": \"peat\",\n        \"item\": \"smoke\",\n        \"intensity\": 9.0\n      },\n      {\n        \"family\": \"marine\",\n        \"group\": \"seaweed\",\n        \"item\": \"brine\",\n        \"intensity\": 7.5\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"medicinal\",\n        \"item\": \"iodine\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"wood\",\n        \"group\": \"oak\",\n        \"item\": \"vanilla\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"lemon\",\n        \"qualifier\": \"zest\",\n        \"intensity\": 3.0\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"sweet\",\n        \"item\": \"dark chocolate\",\n        \"intensity\": 3.5\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"aromatic\",\n        \"item\": \"tar\",\n        \"intensity\": 5.0\n      }\n    ],\n    \"texture\": [\n      \"oily\",\n      \"full-bodied\",\n      \"warming\"\n    ],\n    \"finish_length\": 9.0,\n    \"complexity\": 8.5\n  },\n  \"description\": \"A classic Islay single malt Scotch whisky 
defined by its intense phenolic peat smoke, maritime salinity, and iodine character, balanced by underlying sweetness, vanilla oak, and a long, warming finish.\",\n  \"notes\": \"Best enjoyed neat or with a small splash of still water to open up the aromatics.\",\n  \"metadata\": {\n    \"source\": \"OGS Core Beverage Registry\",\n    \"license\": \"CC BY 4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:ardbeg-10-islay",
                "expected": "ogs:core:beverage:ardbeg-10-islay",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "spirit",
                "expected": "spirit",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3092,
            "output_tokens": 758,
            "latency_ms": 11557.4,
            "usd": 0.020646,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 3092.0,
        "output_tokens_mean": 753.667,
        "latency_ms_mean": 10968.467,
        "usd_sum": 0.061743
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-013",
      "category": "structured_output",
      "capability": "schema_valid_beverage",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.beverage",
        "vocab.beverage_types",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "872d23f39307a374",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS beverage schema (`schema/beverage.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/beverage.schema.json\",\n  \"title\": \"OGS Beverage\",\n  \"description\": \"A beverage entity with type, structural attributes, sensory profile, and provenance.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"type\",\n    \"sensory_profile\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:beverage:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": 
\"#/$defs/lstring\"\n    },\n    \"type\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"wine\",\n        \"beer\",\n        \"spirit\",\n        \"cider\",\n        \"sake\",\n        \"other\"\n      ]\n    },\n    \"subtype\": {\n      \"type\": \"string\"\n    },\n    \"grape_varieties\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"type\": \"string\"\n      }\n    },\n    \"vintage\": {\n      \"type\": \"integer\"\n    },\n    \"region\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"appellation\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"producer\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"abv\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 100\n    },\n    \"residual_sugar\": {\n      \"type\": \"number\",\n      \"minimum\": 0\n    },\n    \"serve_temperature_c\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"min\": {\n          \"type\": \"number\"\n        },\n        \"max\": {\n          \"type\": \"number\"\n        }\n      },\n      \"additionalProperties\": false,\n      \"description\": \"Recommended serving temperature range in degrees Celsius (SI canonical).\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n  
        \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        
\"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    
\"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Beverage types\n\nwine, beer, spirit, cider, sake, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is 
rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document describing 'junmai ginjo sake' as an OGS beverage. It MUST validate against the OGS beverage schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"beverage_name\": \"junmai ginjo sake\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"junmai-ginjo-generic\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/beverage.schema.json",
        "required_fields": {
          "id": "ogs:core:beverage:junmai-ginjo-generic",
          "ogs_version": "0.1.0",
          "type": "sake"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:junmai-ginjo-generic\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\", \"ja\"],\n  \"name\": {\n    \"en\": \"Junmai Ginjo Sake\",\n    \"ja\": \"純米吟醸酒\"\n  },\n  \"description\": {\n    \"en\": \"Junmai Ginjo is a premium class of Japanese sake brewed with rice polished to at least 60% of its original size, using only rice, water, and koji with no added alcohol. It is characterized by fragrant, fruity aromas and a clean, elegant palate.\"\n  },\n  \"type\": \"sake\",\n  \"subtype\": \"junmai ginjo\",\n  \"region\": {\n    \"en\": \"Japan\"\n  },\n  \"abv\": 15.5,\n  \"residual_sugar\": 3.5,\n  \"serve_temperature_c\": {\n    \"min\": 10,\n    \"max\": 15\n  },\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 4.5,\n      \"sour\": 3.0,\n      \"salty\": 0.5,\n      \"bitter\": 1.5,\n      \"umami\": 3.5\n    },\n    \"structural\": {\n      \"body\": 4.0,\n      \"acidity_perception\": 3.5,\n      \"tannin\": 0.0,\n      \"astringency\": 0.5,\n      \"alcohol_heat\": 3.0,\n      \"effervescence\": 0.5\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"tree fruit\",\n        \"item\": \"pear\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"tropical fruit\",\n        \"item\": \"melon\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"stone fruit\",\n        \"item\": \"peach\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"white flower\",\n        \"item\": \"white blossom\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"sweet 
grain\",\n        \"item\": \"rice\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 4.5\n      },\n      {\n        \"family\": \"dairy\",\n        \"group\": \"cream\",\n        \"item\": \"yogurt\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 3.0\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"green herb\",\n        \"item\": \"fresh grass\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 2.5\n      }\n    ],\n    \"texture\": [\n      \"smooth\",\n      \"silky\",\n      \"light-bodied\",\n      \"clean\"\n    ],\n    \"finish_length\": 5.5,\n    \"complexity\": 6.0\n  },\n  \"notes\": {\n    \"en\": \"Junmai Ginjo sake is best served lightly chilled to preserve its delicate floral and fruity aromas. It pairs well with light seafood dishes, sushi, sashimi, and mild cheeses.\"\n  },\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"OGS Core Beverage Catalogue\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:junmai-ginjo-generic",
                "expected": "ogs:core:beverage:junmai-ginjo-generic",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "sake",
                "expected": "sake",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3089,
            "output_tokens": 959,
            "latency_ms": 12425.3,
            "usd": 0.023652,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:junmai-ginjo-generic\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\", \"ja\"],\n  \"name\": {\n    \"en\": \"Junmai Ginjo Sake\",\n    \"ja\": \"純米吟醸酒\"\n  },\n  \"description\": {\n    \"en\": \"A premium Japanese rice wine brewed using the ginjo method, with at least 40% of the rice polished away and no added alcohol. Known for its fragrant, fruity, and floral aromatic profile with a clean, elegant palate.\"\n  },\n  \"type\": \"sake\",\n  \"subtype\": \"junmai ginjo\",\n  \"region\": {\n    \"en\": \"Japan\"\n  },\n  \"abv\": 15.5,\n  \"residual_sugar\": 3.5,\n  \"serve_temperature_c\": {\n    \"min\": 10,\n    \"max\": 15\n  },\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 4.5,\n      \"sour\": 3.0,\n      \"salty\": 0.5,\n      \"bitter\": 1.5,\n      \"umami\": 3.5\n    },\n    \"structural\": {\n      \"body\": 4.0,\n      \"acidity_perception\": 3.5,\n      \"tannin\": 0.0,\n      \"astringency\": 1.0,\n      \"alcohol_heat\": 3.0,\n      \"effervescence\": 0.5\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"tree fruit\",\n        \"item\": \"pear\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"tropical fruit\",\n        \"item\": \"melon\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"white flowers\",\n        \"item\": \"white blossom\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"citrus\",\n        \"item\": \"yuzu\",\n        \"qualifier\": \"zest\",\n        \"intensity\": 4.0\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"sweet\",\n        \"item\": \"rice cake\",\n       
 \"intensity\": 3.5\n      },\n      {\n        \"family\": \"dairy\",\n        \"group\": \"cream\",\n        \"item\": \"yogurt\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 2.5\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"green\",\n        \"item\": \"green grass\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 2.0\n      }\n    ],\n    \"texture\": [\"silky\", \"light\", \"clean\", \"smooth\"],\n    \"finish_length\": 5.5,\n    \"complexity\": 6.0\n  },\n  \"notes\": {\n    \"en\": \"Junmai Ginjo is brewed with rice polished to at least 60% of its original size (seimaibuai ≤60%). The absence of added distilled alcohol and the use of ginjo yeast strains at low fermentation temperatures produce the characteristic ginjo-ka fruity and floral bouquet.\"\n  },\n  \"metadata\": {\n    \"source\": \"OGS Core Beverage Reference\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:junmai-ginjo-generic",
                "expected": "ogs:core:beverage:junmai-ginjo-generic",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "sake",
                "expected": "sake",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3089,
            "output_tokens": 928,
            "latency_ms": 11895.5,
            "usd": 0.023187,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:junmai-ginjo-generic\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": {\n    \"en\": \"Junmai Ginjo Sake\",\n    \"ja\": \"純米吟醸酒\"\n  },\n  \"description\": {\n    \"en\": \"A premium Japanese rice wine brewed using only rice, water, koji, and yeast, with at least 40% of the rice polished away. Junmai Ginjo is known for its fragrant, fruity, and elegant character.\"\n  },\n  \"type\": \"sake\",\n  \"subtype\": \"junmai ginjo\",\n  \"region\": {\n    \"en\": \"Japan\"\n  },\n  \"abv\": 15.5,\n  \"residual_sugar\": 3.5,\n  \"serve_temperature_c\": {\n    \"min\": 10,\n    \"max\": 15\n  },\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 5.5,\n      \"sour\": 3.0,\n      \"salty\": 0.5,\n      \"bitter\": 1.5,\n      \"umami\": 3.5\n    },\n    \"structural\": {\n      \"body\": 4.5,\n      \"acidity_perception\": 3.5,\n      \"tannin\": 0.5,\n      \"astringency\": 1.0,\n      \"alcohol_heat\": 3.5,\n      \"effervescence\": 0.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"fruit\",\n        \"group\": \"tree fruit\",\n        \"item\": \"pear\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"tropical fruit\",\n        \"item\": \"melon\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"stone fruit\",\n        \"item\": \"peach\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"white flowers\",\n        \"item\": \"cherry blossom\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"white flowers\",\n        \"item\": \"jasmine\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 4.5\n   
   },\n      {\n        \"family\": \"confection\",\n        \"group\": \"sweet\",\n        \"item\": \"rice cake\",\n        \"intensity\": 4.0\n      },\n      {\n        \"family\": \"savory\",\n        \"group\": \"fermented\",\n        \"item\": \"koji\",\n        \"qualifier\": \"fermented\",\n        \"intensity\": 3.0\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"green\",\n        \"item\": \"grass\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 2.5\n      }\n    ],\n    \"texture\": [\n      \"silky\",\n      \"smooth\",\n      \"light\",\n      \"clean\"\n    ],\n    \"finish_length\": 6.0,\n    \"complexity\": 6.5\n  },\n  \"metadata\": {\n    \"source\": \"OGS Core Beverage Library\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:junmai-ginjo-generic",
                "expected": "ogs:core:beverage:junmai-ginjo-generic",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "sake",
                "expected": "sake",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3089,
            "output_tokens": 890,
            "latency_ms": 16466.2,
            "usd": 0.022617,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 3089.0,
        "output_tokens_mean": 925.667,
        "latency_ms_mean": 13595.667,
        "usd_sum": 0.069456
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-014",
      "category": "structured_output",
      "capability": "schema_valid_beverage",
      "difficulty": "easy",
      "cuisine_region": "german",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.beverage",
        "vocab.beverage_types",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "3fe3fe74f8d80260",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS beverage schema (`schema/beverage.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/beverage.schema.json\",\n  \"title\": \"OGS Beverage\",\n  \"description\": \"A beverage entity with type, structural attributes, sensory profile, and provenance.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"type\",\n    \"sensory_profile\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:beverage:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": 
\"#/$defs/lstring\"\n    },\n    \"type\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"wine\",\n        \"beer\",\n        \"spirit\",\n        \"cider\",\n        \"sake\",\n        \"other\"\n      ]\n    },\n    \"subtype\": {\n      \"type\": \"string\"\n    },\n    \"grape_varieties\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"type\": \"string\"\n      }\n    },\n    \"vintage\": {\n      \"type\": \"integer\"\n    },\n    \"region\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"appellation\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"producer\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"abv\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 100\n    },\n    \"residual_sugar\": {\n      \"type\": \"number\",\n      \"minimum\": 0\n    },\n    \"serve_temperature_c\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"min\": {\n          \"type\": \"number\"\n        },\n        \"max\": {\n          \"type\": \"number\"\n        }\n      },\n      \"additionalProperties\": false,\n      \"description\": \"Recommended serving temperature range in degrees Celsius (SI canonical).\"\n    },\n    \"sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n  
        \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        
\"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    
\"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Beverage types\n\nwine, beer, spirit, cider, sake, other\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is 
rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce a JSON document describing 'Oktoberfest Märzen lager' as an OGS beverage. It MUST validate against the OGS beverage schema. Respond with ONLY the JSON object.\n\nINPUT:\n{\n  \"beverage_name\": \"Oktoberfest Märzen lager\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"oktoberfest-marzen\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/beverage.schema.json",
        "required_fields": {
          "id": "ogs:core:beverage:oktoberfest-marzen",
          "ogs_version": "0.1.0",
          "type": "beer"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:oktoberfest-marzen\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Oktoberfest Märzen lager\",\n  \"type\": \"beer\",\n  \"subtype\": \"Märzen / Oktoberfest\",\n  \"region\": \"Bavaria, Germany\",\n  \"abv\": 5.8,\n  \"serve_temperature_c\": {\n    \"min\": 7,\n    \"max\": 10\n  },\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 5.5,\n      \"sour\": 1.5,\n      \"salty\": 0.5,\n      \"bitter\": 3.5,\n      \"umami\": 1.0\n    },\n    \"structural\": {\n      \"body\": 6.0,\n      \"acidity_perception\": 2.0,\n      \"tannin\": 0.5,\n      \"astringency\": 1.0,\n      \"alcohol_heat\": 3.5,\n      \"effervescence\": 3.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"confection\",\n        \"group\": \"malt\",\n        \"item\": \"toasted bread\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"malt\",\n        \"item\": \"caramel\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"malt\",\n        \"item\": \"biscuit\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"grain\",\n        \"item\": \"munich malt\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"hop\",\n        \"item\": \"noble hop\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 3.0\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"hop\",\n        \"item\": \"floral hop\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 2.0\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"hop\",\n        \"item\": \"spicy hop\",\n        \"intensity\": 2.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"stone fruit\",\n        \"item\": 
\"dried apricot\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 2.0\n      }\n    ],\n    \"texture\": [\n      \"smooth\",\n      \"medium-bodied\",\n      \"creamy\",\n      \"clean\"\n    ],\n    \"finish_length\": 5.5,\n    \"complexity\": 5.0\n  },\n  \"description\": \"A traditional Bavarian amber lager brewed in the Märzen style, historically lagered through summer and served at Oktoberfest. It features a rich toasty malt backbone with caramel sweetness, balanced by noble hop bitterness and a clean, smooth finish.\",\n  \"metadata\": {\n    \"source\": \"OGS core dataset\",\n    \"license\": \"CC BY 4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:oktoberfest-marzen",
                "expected": "ogs:core:beverage:oktoberfest-marzen",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "beer",
                "expected": "beer",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3099,
            "output_tokens": 834,
            "latency_ms": 12266.0,
            "usd": 0.021807,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:oktoberfest-marzen\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": \"Oktoberfest Märzen lager\",\n  \"description\": \"A traditional German amber lager brewed in March (März) and lagered through summer for serving at Oktoberfest. Rich malt character with a clean, balanced finish.\",\n  \"type\": \"beer\",\n  \"subtype\": \"Märzen / Oktoberfest\",\n  \"region\": \"Bavaria, Germany\",\n  \"abv\": 5.8,\n  \"serve_temperature_c\": {\n    \"min\": 7,\n    \"max\": 10\n  },\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 4.5,\n      \"sour\": 1.5,\n      \"salty\": 0.5,\n      \"bitter\": 3.5,\n      \"umami\": 1.0\n    },\n    \"structural\": {\n      \"body\": 6.0,\n      \"acidity_perception\": 2.0,\n      \"tannin\": 0.5,\n      \"astringency\": 1.0,\n      \"alcohol_heat\": 3.5,\n      \"effervescence\": 4.0\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"confection\",\n        \"group\": \"malt\",\n        \"item\": \"biscuit\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 7.5\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"malt\",\n        \"item\": \"caramel\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"malt\",\n        \"item\": \"toasted bread\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 6.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"grain\",\n        \"item\": \"cereal\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 5.0\n      },\n      {\n        \"family\": \"floral\",\n        \"group\": \"hop\",\n        \"item\": \"noble hop blossom\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 3.0\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"hop\",\n        \"item\": \"herbal hop\",\n        
\"qualifier\": \"fresh\",\n        \"intensity\": 2.5\n      },\n      {\n        \"family\": \"spice\",\n        \"group\": \"warm spice\",\n        \"item\": \"light clove\",\n        \"qualifier\": \"fresh\",\n        \"intensity\": 1.5\n      }\n    ],\n    \"texture\": [\n      \"smooth\",\n      \"medium-full body\",\n      \"creamy mouthfeel\",\n      \"clean finish\"\n    ],\n    \"finish_length\": 5.5,\n    \"complexity\": 5.5\n  },\n  \"notes\": \"Classic Bavarian Märzen style, amber-copper in colour with a persistent off-white head. Dominated by rich Munich malt sweetness, restrained noble-hop bitterness, and a clean lager character from extended cold conditioning.\",\n  \"metadata\": {\n    \"source\": \"OGS Core Beverage Library\",\n    \"license\": \"CC-BY-4.0\",\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"updated\": \"2024-01-01T00:00:00Z\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:oktoberfest-marzen",
                "expected": "ogs:core:beverage:oktoberfest-marzen",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "beer",
                "expected": "beer",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3099,
            "output_tokens": 904,
            "latency_ms": 13504.3,
            "usd": 0.022857,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:beverage:oktoberfest-marzen\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": \"Oktoberfest Märzen lager\",\n  \"description\": \"A traditional German amber lager brewed in March (März) and lagered through summer for serving at Oktoberfest. Known for its rich malt character, toasty bread notes, and clean finish.\",\n  \"type\": \"beer\",\n  \"subtype\": \"Märzen / Oktoberfest lager\",\n  \"region\": \"Bavaria, Germany\",\n  \"abv\": 5.8,\n  \"serve_temperature_c\": {\n    \"min\": 7,\n    \"max\": 10\n  },\n  \"sensory_profile\": {\n    \"basic_tastes\": {\n      \"sweet\": 5.5,\n      \"sour\": 1.5,\n      \"salty\": 0.5,\n      \"bitter\": 3.5,\n      \"umami\": 1.0\n    },\n    \"structural\": {\n      \"body\": 6.0,\n      \"acidity_perception\": 2.0,\n      \"tannin\": 0.5,\n      \"astringency\": 1.0,\n      \"alcohol_heat\": 3.5,\n      \"effervescence\": 4.5\n    },\n    \"aromatics\": [\n      {\n        \"family\": \"confection\",\n        \"group\": \"bread\",\n        \"item\": \"toasted bread\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 7.5\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"caramel\",\n        \"item\": \"toffee\",\n        \"qualifier\": \"cooked\",\n        \"intensity\": 6.5\n      },\n      {\n        \"family\": \"confection\",\n        \"group\": \"malt\",\n        \"item\": \"biscuit malt\",\n        \"intensity\": 7.0\n      },\n      {\n        \"family\": \"herbal\",\n        \"group\": \"hop\",\n        \"item\": \"noble hop\",\n        \"qualifier\": \"dried\",\n        \"intensity\": 3.0\n      },\n      {\n        \"family\": \"earth\",\n        \"group\": \"grain\",\n        \"item\": \"Munich malt\",\n        \"intensity\": 5.5\n      },\n      {\n        \"family\": \"fruit\",\n        \"group\": \"stone fruit\",\n        \"item\": \"dried apricot\",\n        \"qualifier\": \"dried\",\n        
\"intensity\": 2.5\n      }\n    ],\n    \"texture\": [\n      \"smooth\",\n      \"creamy\",\n      \"medium-bodied\",\n      \"clean finish\"\n    ],\n    \"finish_length\": 5.5,\n    \"complexity\": 5.5\n  },\n  \"notes\": \"Best served fresh during the autumn festival season. Pairs well with roast pork, pretzels, and aged cheeses.\",\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"OGS Core Beverage Catalog\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:beverage:oktoberfest-marzen",
                "expected": "ogs:core:beverage:oktoberfest-marzen",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "type": {
                "predicted": "beer",
                "expected": "beer",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 3099,
            "output_tokens": 799,
            "latency_ms": 11588.4,
            "usd": 0.021282,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 3099.0,
        "output_tokens_mean": 845.667,
        "latency_ms_mean": 12452.9,
        "usd_sum": 0.065946
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-015",
      "category": "structured_output",
      "capability": "schema_valid_pairing",
      "difficulty": "hard",
      "cuisine_region": "french",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.pairing",
        "vocab.match_types",
        "vocab.explanation_codes",
        "vocab.grade_scale",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "f0c01fa20f35eb51",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS pairing schema (`schema/pairing.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/pairing.schema.json\",\n  \"title\": \"OGS Pairing\",\n  \"description\": \"A scored, explained pairing between a dish and a beverage.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"dish_ref\",\n    \"beverage_ref\",\n    \"match_types\",\n    \"overall_score\",\n    \"grade\",\n    \"explanation_codes\",\n    \"confidence\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:pairing:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n     
 \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"dish_ref\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:dish:[a-z0-9-]+$\"\n    },\n    \"beverage_ref\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:beverage:[a-z0-9-]+$\"\n    },\n    \"match_types\": {\n      \"type\": \"array\",\n      \"minItems\": 1,\n      \"items\": {\n        \"$ref\": \"#/$defs/match_type_entry\"\n      }\n    },\n    \"overall_score\": {\n      \"type\": \"integer\",\n      \"minimum\": 0,\n      \"maximum\": 100\n    },\n    \"grade\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"A+\",\n        \"A\",\n        \"B+\",\n        \"B\",\n        \"C+\",\n        \"C\",\n        \"D\",\n        \"F\"\n      ]\n    },\n    \"explanation_codes\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/explanation_code_entry\"\n      }\n    },\n    \"risks\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/risk_entry\"\n      }\n    },\n    \"confidence\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 1\n    },\n    \"evidence\": {\n      \"$ref\": \"#/$defs/evidence\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          
\"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"match_type_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"type\",\n 
       \"score\"\n      ],\n      \"properties\": {\n        \"type\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"harmony\",\n            \"contrast\",\n            \"bridge\",\n            \"intensity_alignment\",\n            \"structural_balance\",\n            \"refresh\"\n          ]\n        },\n        \"score\": {\n          \"type\": \"integer\",\n          \"minimum\": 0,\n          \"maximum\": 100\n        },\n        \"description\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"primary\": {\n          \"type\": \"boolean\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"explanation_code_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"code\"\n      ],\n      \"properties\": {\n        \"code\": {\n          \"type\": \"string\"\n        },\n        \"weight\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 1\n        },\n        \"detail\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"risk_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"code\",\n        \"severity\"\n      ],\n      \"properties\": {\n        \"code\": {\n          \"type\": \"string\"\n        },\n        \"severity\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"low\",\n            \"medium\",\n            \"high\"\n          ]\n        },\n        \"description\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"mitigation\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"evidence\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"method\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"expert_panel\",\n            \"instrumental\",\n            \"computed\",\n            \"estimated\"\n  
        ]\n        },\n        \"sources\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n### OGS Explanation Codes\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a 
crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor 
intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n### Grade scale (derived from 0-100 overall_score)\n\n- **A+**: 95-100\n- **A**: 90-94\n- **B+**: 85-89\n- **B**: 80-84\n- **C+**: 75-79\n- **C**: 70-74\n- **D**: 60-69\n- **F**: <60\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. 
`core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce an OGS pairing JSON document for the described match. It MUST validate against the OGS pairing schema. Respond with ONLY the JSON.\n\nINPUT:\n{\n  \"pairing_slug\": \"crottin-sancerre\",\n  \"dish_ref\": \"ogs:core:dish:crottin-de-chavignol\",\n  \"beverage_ref\": \"ogs:core:beverage:sancerre-blanc\",\n  \"intent\": \"Classic Loire regional pairing: Sancerre's acidity and citrus meet fresh goat cheese's tang.\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/pairing.schema.json",
        "required_fields": {
          "id": "ogs:core:pairing:crottin-sancerre",
          "ogs_version": "0.1.0",
          "dish_ref": "ogs:core:dish:crottin-de-chavignol",
          "beverage_ref": "ogs:core:beverage:sancerre-blanc"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:pairing:crottin-sancerre\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Crottin de Chavignol & Sancerre Blanc\",\n  \"dish_ref\": \"ogs:core:dish:crottin-de-chavignol\",\n  \"beverage_ref\": \"ogs:core:beverage:sancerre-blanc\",\n  \"match_types\": [\n    {\n      \"type\": \"harmony\",\n      \"score\": 92,\n      \"primary\": true,\n      \"description\": \"Regional affinity and shared flavor profiles create a seamless, classic pairing.\"\n    },\n    {\n      \"type\": \"refresh\",\n      \"score\": 88,\n      \"primary\": false,\n      \"description\": \"Sancerre's bright acidity cuts through the creamy, tangy richness of the goat cheese.\"\n    },\n    {\n      \"type\": \"bridge\",\n      \"score\": 85,\n      \"primary\": false,\n      \"description\": \"Shared herbal and mineral notes bridge the cheese and wine aromatically.\"\n    }\n  ],\n  \"overall_score\": 93,\n  \"grade\": \"A\",\n  \"explanation_codes\": [\n    {\n      \"code\": \"HARMONY_REGIONAL\",\n      \"weight\": 0.35,\n      \"detail\": \"Both originate from the Loire Valley, reflecting a classic terroir-driven compatibility between Crottin de Chavignol and Sancerre Blanc.\"\n    },\n    {\n      \"code\": \"REFRESH_ACID\",\n      \"weight\": 0.25,\n      \"detail\": \"Sancerre's characteristically high acidity refreshes the palate after each bite of rich, tangy goat cheese.\"\n    },\n    {\n      \"code\": \"STRUCT_ACIDITY_BALANCE\",\n      \"weight\": 0.20,\n      \"detail\": \"The wine's acidity mirrors and complements the natural tang of the fresh goat cheese, creating a balanced interaction.\"\n    },\n    {\n      \"code\": \"AROMA_SHARED_HERBAL\",\n      \"weight\": 0.10,\n      \"detail\": \"Herbaceous and grassy notes appear in both the Sauvignon Blanc-based wine and the goat cheese, creating an aromatic bridge.\"\n    },\n    {\n      \"code\": \"REFRESH_MINERAL\",\n      \"weight\": 0.10,\n      \"detail\": \"Sancerre's flint 
and mineral character provides a clean, palate-resetting counterpoint to the cheese's creaminess.\"\n    }\n  ],\n  \"risks\": [\n    {\n      \"code\": \"RISK_ACID_CLASH\",\n      \"severity\": \"low\",\n      \"description\": \"If the goat cheese is particularly acidic and the wine has aggressive acidity, the combination could tip toward sharpness.\",\n      \"mitigation\": \"Choose a Sancerre from a riper vintage to ensure the acidity is balanced by fruit weight.\"\n    }\n  ],\n  \"confidence\": 0.95,\n  \"evidence\": {\n    \"method\": \"expert_panel\",\n    \"notes\": \"This is one of the most celebrated and well-documented regional pairings in French gastronomy, consistently praised by sommeliers and food writers.\"\n  },\n  \"description\": \"A quintessential Loire Valley pairing: the bright citrus, gooseberry, and mineral-driven Sancerre Blanc meets the fresh, lactic tang and creamy texture of Crottin de Chavignol. The wine's vivid acidity harmonizes with the cheese's tanginess while its herbal and flint notes echo the terroir both share.\",\n  \"metadata\": {\n    \"source\": \"OGS Core Pairings\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:pairing:crottin-sancerre",
                "expected": "ogs:core:pairing:crottin-sancerre",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "dish_ref": {
                "predicted": "ogs:core:dish:crottin-de-chavignol",
                "expected": "ogs:core:dish:crottin-de-chavignol",
                "match": true
              },
              "beverage_ref": {
                "predicted": "ogs:core:beverage:sancerre-blanc",
                "expected": "ogs:core:beverage:sancerre-blanc",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 4204,
            "output_tokens": 929,
            "latency_ms": 12928.5,
            "usd": 0.026547,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:pairing:crottin-sancerre\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Crottin de Chavignol & Sancerre Blanc\",\n  \"dish_ref\": \"ogs:core:dish:crottin-de-chavignol\",\n  \"beverage_ref\": \"ogs:core:beverage:sancerre-blanc\",\n  \"match_types\": [\n    {\n      \"type\": \"harmony\",\n      \"score\": 92,\n      \"primary\": true,\n      \"description\": \"Regional affinity and flavor convergence between Loire goat cheese and Sancerre Sauvignon Blanc.\"\n    },\n    {\n      \"type\": \"refresh\",\n      \"score\": 88,\n      \"primary\": false,\n      \"description\": \"Sancerre's bright acidity and mineral character cleanse the palate of the cheese's creamy tang.\"\n    },\n    {\n      \"type\": \"bridge\",\n      \"score\": 85,\n      \"primary\": false,\n      \"description\": \"Shared herbal and citrus aromatics link the grassy notes of Sauvignon Blanc with the fresh, lactic character of the cheese.\"\n    }\n  ],\n  \"overall_score\": 92,\n  \"grade\": \"A\",\n  \"explanation_codes\": [\n    {\n      \"code\": \"HARMONY_REGIONAL\",\n      \"weight\": 0.30,\n      \"detail\": \"Both Crottin de Chavignol and Sancerre originate from the Loire Valley, reflecting a classic terroir-driven pairing.\"\n    },\n    {\n      \"code\": \"STRUCT_ACIDITY_BALANCE\",\n      \"weight\": 0.25,\n      \"detail\": \"Sancerre's crisp acidity cuts through the creamy richness of the goat cheese, restoring freshness with each sip.\"\n    },\n    {\n      \"code\": \"AROMA_SHARED_HERBAL\",\n      \"weight\": 0.20,\n      \"detail\": \"The grassy, herbal qualities of Sauvignon Blanc echo the fresh, herbaceous notes found in Crottin de Chavignol.\"\n    },\n    {\n      \"code\": \"REFRESH_MINERAL\",\n      \"weight\": 0.15,\n      \"detail\": \"Sancerre's characteristic flinty minerality resets the palate between bites of the tangy cheese.\"\n    },\n    {\n      \"code\": \"REFRESH_ACID\",\n      \"weight\": 0.10,\n      
\"detail\": \"High acidity in the wine cuts through the fat and lactic tang of the goat cheese, keeping the pairing lively.\"\n    }\n  ],\n  \"risks\": [\n    {\n      \"code\": \"RISK_ACID_CLASH\",\n      \"severity\": \"low\",\n      \"description\": \"If the cheese is very young and highly acidic, the combined acidity may be slightly sharp for some palates.\",\n      \"mitigation\": \"Opt for a Sancerre with moderate residual minerality and serve the cheese at room temperature to mellow its tang.\"\n    }\n  ],\n  \"confidence\": 0.95,\n  \"evidence\": {\n    \"method\": \"expert_panel\",\n    \"sources\": [\n      \"Traditional Loire Valley pairing canon\",\n      \"French cheese and wine pairing literature\"\n    ],\n    \"notes\": \"One of the most celebrated classic regional pairings in French gastronomy.\"\n  },\n  \"description\": \"A quintessential Loire Valley pairing: the bright citrus and grassy aromatics of Sancerre Blanc harmonise beautifully with the lactic tang and creamy texture of Crottin de Chavignol, while the wine's vivid acidity and mineral backbone refresh the palate with every bite.\",\n  \"metadata\": {\n    \"created\": \"2024-01-01T00:00:00Z\",\n    \"source\": \"OGS Core Pairings\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:pairing:crottin-sancerre",
                "expected": "ogs:core:pairing:crottin-sancerre",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "dish_ref": {
                "predicted": "ogs:core:dish:crottin-de-chavignol",
                "expected": "ogs:core:dish:crottin-de-chavignol",
                "match": true
              },
              "beverage_ref": {
                "predicted": "ogs:core:beverage:sancerre-blanc",
                "expected": "ogs:core:beverage:sancerre-blanc",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 4204,
            "output_tokens": 982,
            "latency_ms": 13401.5,
            "usd": 0.027342,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:pairing:crottin-sancerre\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Crottin de Chavignol & Sancerre Blanc\",\n  \"dish_ref\": \"ogs:core:dish:crottin-de-chavignol\",\n  \"beverage_ref\": \"ogs:core:beverage:sancerre-blanc\",\n  \"match_types\": [\n    {\n      \"type\": \"harmony\",\n      \"score\": 92,\n      \"primary\": true,\n      \"description\": \"Regional affinity and flavor convergence between Loire goat cheese and Sancerre Sauvignon Blanc.\"\n    },\n    {\n      \"type\": \"refresh\",\n      \"score\": 88,\n      \"primary\": false,\n      \"description\": \"Sancerre's high acidity and mineral character cleanse and refresh the palate after each bite of rich, tangy cheese.\"\n    },\n    {\n      \"type\": \"structural_balance\",\n      \"score\": 85,\n      \"primary\": false,\n      \"description\": \"Acidity in the wine balances the fat and creaminess of the goat cheese.\"\n    }\n  ],\n  \"overall_score\": 93,\n  \"grade\": \"A\",\n  \"explanation_codes\": [\n    {\n      \"code\": \"HARMONY_REGIONAL\",\n      \"weight\": 0.35,\n      \"detail\": \"Both Crottin de Chavignol and Sancerre Blanc originate from the Loire Valley, sharing a terroir-driven compatibility that makes this a textbook regional pairing.\"\n    },\n    {\n      \"code\": \"HARMONY_FLAVOR_CONVERGE\",\n      \"weight\": 0.25,\n      \"detail\": \"The citrus and herbal notes of Sauvignon Blanc mirror the bright, tangy flavor profile of fresh goat cheese, creating a unified taste experience.\"\n    },\n    {\n      \"code\": \"STRUCT_ACIDITY_BALANCE\",\n      \"weight\": 0.20,\n      \"detail\": \"Sancerre's vibrant acidity cuts through the creaminess of the cheese and matches its inherent tanginess, maintaining freshness throughout the pairing.\"\n    },\n    {\n      \"code\": \"REFRESH_MINERAL\",\n      \"weight\": 0.10,\n      \"detail\": \"The flint and chalk mineral character of Sancerre provides a clean, palate-resetting 
effect between bites of cheese.\"\n    },\n    {\n      \"code\": \"AROMA_SHARED_HERBAL\",\n      \"weight\": 0.10,\n      \"detail\": \"Grassy, herbal aromatics present in both Sauvignon Blanc and fresh goat cheese create an aromatic continuity.\"\n    }\n  ],\n  \"risks\": [\n    {\n      \"code\": \"RISK_ACID_CLASH\",\n      \"severity\": \"low\",\n      \"description\": \"If the Crottin is very young and highly acidic, the combined acidity with the wine could become sharp.\",\n      \"mitigation\": \"Choose a Crottin at a slightly more mature stage (mi-sec) to temper its acidity and increase creaminess.\"\n    }\n  ],\n  \"confidence\": 0.95,\n  \"evidence\": {\n    \"method\": \"expert_panel\",\n    \"notes\": \"One of the most celebrated and well-documented regional pairings in French gastronomy, consistently validated by sommeliers and food writers.\"\n  },\n  \"description\": \"A classic Loire Valley pairing where the bright citrus, mineral, and herbal notes of Sancerre Blanc find their perfect counterpart in the tangy, earthy richness of Crottin de Chavignol. The wine's acidity lifts the cheese's creaminess while their shared regional terroir creates an elegant, seamless harmony.\",\n  \"metadata\": {\n    \"source\": \"OGS Core Pairings Library\",\n    \"license\": \"CC-BY-4.0\"\n  }\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "schema_valid": true,
            "schema_errors": [],
            "consistent": true,
            "required_fields": {
              "id": {
                "predicted": "ogs:core:pairing:crottin-sancerre",
                "expected": "ogs:core:pairing:crottin-sancerre",
                "match": true
              },
              "ogs_version": {
                "predicted": "0.1.0",
                "expected": "0.1.0",
                "match": true
              },
              "dish_ref": {
                "predicted": "ogs:core:dish:crottin-de-chavignol",
                "expected": "ogs:core:dish:crottin-de-chavignol",
                "match": true
              },
              "beverage_ref": {
                "predicted": "ogs:core:beverage:sancerre-blanc",
                "expected": "ogs:core:beverage:sancerre-blanc",
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 4204,
            "output_tokens": 973,
            "latency_ms": 13662.4,
            "usd": 0.027207,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 4204.0,
        "output_tokens_mean": 961.333,
        "latency_ms_mean": 13330.8,
        "usd_sum": 0.081096
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-016",
      "category": "structured_output",
      "capability": "schema_valid_dish",
      "difficulty": "hard",
      "cuisine_region": "japanese",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.dish",
        "vocab.component_roles",
        "vocab.cooking_methods",
        "vocab.courses",
        "vocab.service_temperatures",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "284b25ae8f0dc3ef",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS dish schema (`schema/dish.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/dish.schema.json\",\n  \"title\": \"OGS Dish\",\n  \"description\": \"A composed dish with components, cooking methods, and overall sensory profile.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"components\",\n    \"overall_sensory_profile\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:dish:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": 
\"#/$defs/lstring\"\n    },\n    \"cuisine\": {\n      \"type\": \"string\",\n      \"description\": \"Free-text cuisine name (v0.1) OR a cuisine entity reference matching ogs:<ns>:cuisine:<id> (v0.2+).\"\n    },\n    \"course\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"amuse\",\n        \"appetizer\",\n        \"soup\",\n        \"salad\",\n        \"fish\",\n        \"main\",\n        \"cheese\",\n        \"dessert\",\n        \"petit_four\"\n      ]\n    },\n    \"service_temperature\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"frozen\",\n        \"cold\",\n        \"cool\",\n        \"room\",\n        \"warm\",\n        \"hot\",\n        \"very_hot\"\n      ]\n    },\n    \"components\": {\n      \"type\": \"array\",\n      \"minItems\": 1,\n      \"items\": {\n        \"$ref\": \"#/$defs/component\"\n      }\n    },\n    \"overall_sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              
\"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"component\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"role\",\n        \"ingredient_ref\"\n      ],\n      \"properties\": {\n        \"role\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"main_protein\",\n            \"main_vegetable\",\n            \"starch\",\n            \"sauce\",\n            \"fat_component\",\n            \"acid_component\",\n            
\"aromatic_garnish\",\n            \"seasoning\",\n            \"textural_element\"\n          ]\n        },\n        \"ingredient_ref\": {\n          \"type\": \"string\",\n          \"pattern\": \"^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$\"\n        },\n        \"cooking_method\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"raw\",\n            \"seared\",\n            \"grilled\",\n            \"roasted\",\n            \"braised\",\n            \"poached\",\n            \"steamed\",\n            \"fried\",\n            \"confit\",\n            \"smoked\",\n            \"fermented\",\n            \"cured\",\n            \"pickled\",\n            \"dehydrated\",\n            \"sous_vide\",\n            \"blanched\",\n            \"sauteed\",\n            \"baked\",\n            \"charred\",\n            \"emulsified\"\n          ]\n        },\n        \"maillard_index\": {\n          \"type\": \"integer\",\n          \"minimum\": 0,\n          \"maximum\": 5\n        },\n        \"reduction_level\": {\n          \"type\": \"integer\",\n          \"minimum\": 0,\n          \"maximum\": 5\n        },\n        \"proportion\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"dominant\",\n            \"significant\",\n            \"accent\",\n            \"trace\"\n          ]\n        },\n        \"sensory_contribution\": {\n          \"type\": \"object\",\n          \"properties\": {\n            \"basic_tastes\": {\n              \"$ref\": \"#/$defs/basic_tastes\"\n            },\n            \"aromatics\": {\n              \"type\": \"array\",\n              \"items\": {\n                \"$ref\": \"#/$defs/aromatic_entry\"\n              }\n            },\n            \"texture\": {\n              \"type\": \"array\",\n              \"items\": {\n                \"type\": \"string\"\n              }\n            }\n          },\n          \"additionalProperties\": true\n        },\n        \"notes\": {\n          
\"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": 
\"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, 
fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Service temperatures\n\nfrozen, cold, cool, room, warm, hot, very_hot\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce an OGS dish JSON for tonkotsu ramen (pork-bone broth, wheat noodles, chashu, soft egg, nori, scallion). Respond with ONLY the JSON.\n\nINPUT:\n{\n  \"dish_name\": \"tonkotsu ramen\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"tonkotsu-ramen\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/dish.schema.json",
        "required_fields": {
          "id": "ogs:core:dish:tonkotsu-ramen",
          "ogs_version": "0.1.0",
          "course": "main"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:dish:tonkotsu-ramen\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": {\n    \"en\": \"Tonkotsu Ramen\",\n    \"ja\": \"豚骨ラーメン\"\n  },\n  \"description\": {\n    \"en\": \"A rich, creamy Japanese ramen featuring a long-simmered pork-bone broth, springy wheat noodles, braised chashu pork belly, a soft-boiled soy-marinated egg, nori, and scallion.\"\n  },\n  \"cuisine\": \"ogs:core:cuisine:japanese\",\n  \"course\": \"main\",\n  \"service_temperature\": \"very_hot\",\n  \"components\": [\n    {\n      \"role\": \"sauce\",\n      \"ingredient_ref\": \"ogs:core:ingredient:pork-bone-broth\",\n      \"cooking_method\": \"braised\",\n      \"maillard_index\": 1,\n      \"reduction_level\": 4,\n      \"proportion\": \"dominant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"salty\": 6,\n          \"umami\": 9,\n          \"sweet\": 2\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"savory\",\n            \"group\": \"meaty\",\n            \"item\": \"pork bone\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 9\n          },\n          {\n            \"family\": \"dairy\",\n            \"group\": \"creamy\",\n            \"item\": \"collagen richness\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 7\n          }\n        ],\n        \"texture\": [\"rich\", \"creamy\", \"viscous\"]\n      },\n      \"notes\": {\n        \"en\": \"Simmered for 12+ hours to extract collagen and fat, yielding an opaque, milky broth.\"\n      }\n    },\n    {\n      \"role\": \"starch\",\n      \"ingredient_ref\": \"ogs:core:ingredient:wheat-ramen-noodles\",\n      \"cooking_method\": \"blanched\",\n      \"maillard_index\": 0,\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"salty\": 2,\n          \"umami\": 1\n        },\n        
\"aromatics\": [\n          {\n            \"family\": \"vegetal\",\n            \"group\": \"grain\",\n            \"item\": \"wheat\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 3\n          }\n        ],\n        \"texture\": [\"springy\", \"chewy\", \"firm\"]\n      },\n      \"notes\": {\n        \"en\": \"Alkaline wheat noodles (kansui) with a characteristic springy bite.\"\n      }\n    },\n    {\n      \"role\": \"main_protein\",\n      \"ingredient_ref\": \"ogs:core:ingredient:chashu-pork-belly\",\n      \"cooking_method\": \"braised\",\n      \"maillard_index\": 3,\n      \"reduction_level\": 2,\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"salty\": 5,\n          \"sweet\": 4,\n          \"umami\": 7\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"savory\",\n            \"group\": \"meaty\",\n            \"item\": \"braised pork\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 7\n          },\n          {\n            \"family\": \"spice\",\n            \"group\": \"warm spice\",\n            \"item\": \"soy-mirin glaze\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 4\n          }\n        ],\n        \"texture\": [\"tender\", \"melt-in-mouth\", \"fatty\"]\n      },\n      \"notes\": {\n        \"en\": \"Rolled and tied pork belly braised in soy, mirin, sake, and sugar until deeply tender.\"\n      }\n    },\n    {\n      \"",
          "compliant": false,
          "score": 0.0,
          "detail": {
            "error": "no_json"
          },
          "metrics": {
            "input_tokens": 3753,
            "output_tokens": 1024,
            "latency_ms": 14169.5,
            "usd": 0.026619,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "max_tokens"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:dish:tonkotsu-ramen\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\", \"ja\"],\n  \"name\": {\n    \"en\": \"Tonkotsu Ramen\",\n    \"ja\": \"豚骨ラーメン\"\n  },\n  \"description\": {\n    \"en\": \"A rich, creamy pork-bone broth ramen featuring wheat noodles, braised chashu pork belly, soft-boiled soy-marinated egg, nori, and fresh scallion.\"\n  },\n  \"cuisine\": \"ogs:core:cuisine:japanese\",\n  \"course\": \"main\",\n  \"service_temperature\": \"hot\",\n  \"components\": [\n    {\n      \"role\": \"sauce\",\n      \"ingredient_ref\": \"ogs:core:ingredient:pork-bone-broth\",\n      \"cooking_method\": \"braised\",\n      \"maillard_index\": 1,\n      \"reduction_level\": 4,\n      \"proportion\": \"dominant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"salty\": 6,\n          \"umami\": 9,\n          \"sweet\": 2\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"savory\",\n            \"group\": \"meat\",\n            \"item\": \"pork bone\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 9\n          },\n          {\n            \"family\": \"dairy\",\n            \"group\": \"creamy\",\n            \"item\": \"collagen-rich stock\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 7\n          }\n        ],\n        \"texture\": [\"rich\", \"viscous\", \"coating\"]\n      },\n      \"notes\": {\n        \"en\": \"Long-simmered pork-bone broth, opaque and creamy due to emulsified collagen and fat.\"\n      }\n    },\n    {\n      \"role\": \"starch\",\n      \"ingredient_ref\": \"ogs:core:ingredient:wheat-ramen-noodles\",\n      \"cooking_method\": \"blanched\",\n      \"maillard_index\": 0,\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"salty\": 2,\n          \"umami\": 2\n        },\n        
\"aromatics\": [\n          {\n            \"family\": \"vegetal\",\n            \"group\": \"grain\",\n            \"item\": \"wheat\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 3\n          }\n        ],\n        \"texture\": [\"springy\", \"chewy\", \"firm\"]\n      },\n      \"notes\": {\n        \"en\": \"Thin alkaline wheat noodles, springy and slightly chewy.\"\n      }\n    },\n    {\n      \"role\": \"main_protein\",\n      \"ingredient_ref\": \"ogs:core:ingredient:chashu-pork-belly\",\n      \"cooking_method\": \"braised\",\n      \"maillard_index\": 3,\n      \"reduction_level\": 2,\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"salty\": 5,\n          \"sweet\": 4,\n          \"umami\": 7\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"savory\",\n            \"group\": \"meat\",\n            \"item\": \"pork belly\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 7\n          },\n          {\n            \"family\": \"spice\",\n            \"group\": \"warm\",\n            \"item\": \"soy-mirin glaze\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 5\n          }\n        ],\n        \"texture\": [\"tender\", \"melt-in-mouth\", \"fatty\"]\n      },\n      \"notes\": {\n        \"en\": \"Rolled pork belly simmered in soy, mirin, sake and sugar; sliced and often torched for a caramelised finish.\"\n      }\n    },\n    {",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "['en', 'ja'] is not of type 'object'"
            ],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": null,
                "expected": "ogs:core:dish:tonkotsu-ramen",
                "match": false
              },
              "ogs_version": {
                "predicted": null,
                "expected": "0.1.0",
                "match": false
              },
              "course": {
                "predicted": null,
                "expected": "main",
                "match": false
              }
            }
          },
          "metrics": {
            "input_tokens": 3753,
            "output_tokens": 1023,
            "latency_ms": 14014.2,
            "usd": 0.026604,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "max_tokens"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:dish:tonkotsu-ramen\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"available_languages\": [\"en\", \"ja\"],\n  \"name\": {\n    \"en\": \"Tonkotsu Ramen\",\n    \"ja\": \"豚骨ラーメン\"\n  },\n  \"description\": {\n    \"en\": \"A rich, creamy pork-bone broth ramen featuring wheat noodles, braised chashu pork belly, soft-boiled soy-marinated egg, nori, and scallion.\"\n  },\n  \"cuisine\": \"ogs:core:cuisine:japanese\",\n  \"course\": \"main\",\n  \"service_temperature\": \"hot\",\n  \"components\": [\n    {\n      \"role\": \"sauce\",\n      \"ingredient_ref\": \"ogs:core:ingredient:pork-bone-broth\",\n      \"cooking_method\": \"braised\",\n      \"maillard_index\": 1,\n      \"reduction_level\": 4,\n      \"proportion\": \"dominant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"salty\": 6.5,\n          \"umami\": 9,\n          \"sweet\": 2,\n          \"bitter\": 0.5,\n          \"sour\": 0.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"savory\",\n            \"group\": \"broth\",\n            \"item\": \"pork bone\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 9\n          },\n          {\n            \"family\": \"dairy\",\n            \"group\": \"fat\",\n            \"item\": \"rendered pork fat\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 7\n          }\n        ],\n        \"texture\": [\"creamy\", \"viscous\", \"coating\"]\n      }\n    },\n    {\n      \"role\": \"starch\",\n      \"ingredient_ref\": \"ogs:core:ingredient:wheat-ramen-noodles\",\n      \"cooking_method\": \"boiled\",\n      \"maillard_index\": 0,\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"salty\": 2,\n          \"umami\": 1,\n          \"sweet\": 1,\n          \"bitter\": 0,\n          \"sour\": 0\n        },\n        \"aromatics\": 
[\n          {\n            \"family\": \"vegetal\",\n            \"group\": \"grain\",\n            \"item\": \"wheat\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 3\n          }\n        ],\n        \"texture\": [\"chewy\", \"springy\", \"silky\"]\n      }\n    },\n    {\n      \"role\": \"main_protein\",\n      \"ingredient_ref\": \"ogs:core:ingredient:chashu-pork-belly\",\n      \"cooking_method\": \"braised\",\n      \"maillard_index\": 3,\n      \"reduction_level\": 2,\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"salty\": 6,\n          \"umami\": 8,\n          \"sweet\": 4,\n          \"bitter\": 1,\n          \"sour\": 0.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"savory\",\n            \"group\": \"meat\",\n            \"item\": \"pork belly\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 8\n          },\n          {\n            \"family\": \"spice\",\n            \"group\": \"warm spice\",\n            \"item\": \"soy sauce caramel\",\n            \"qualifier\": \"cooked\",\n            \"intensity\": 5\n          }\n        ],\n        \"texture\": [\"tender\", \"unctuous\", \"melt-in-mouth\"]\n      }\n    },\n    {\n      \"role\": \"textural_element\",\n      \"ingredient_ref\": \"ogs:core:ingredient:soft-boiled-soy-egg\",\n      \"cooking_method\": \"poached\",\n      \"maill",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "['en', 'ja'] is not of type 'object'"
            ],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": null,
                "expected": "ogs:core:dish:tonkotsu-ramen",
                "match": false
              },
              "ogs_version": {
                "predicted": null,
                "expected": "0.1.0",
                "match": false
              },
              "course": {
                "predicted": null,
                "expected": "main",
                "match": false
              }
            }
          },
          "metrics": {
            "input_tokens": 3753,
            "output_tokens": 1024,
            "latency_ms": 13126.3,
            "usd": 0.026619,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "max_tokens"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 0.6667,
      "metrics": {
        "input_tokens_mean": 3753.0,
        "output_tokens_mean": 1023.667,
        "latency_ms_mean": 13770.0,
        "usd_sum": 0.079842
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-017",
      "category": "structured_output",
      "capability": "schema_valid_dish",
      "difficulty": "medium",
      "cuisine_region": "mexican",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.dish",
        "vocab.component_roles",
        "vocab.cooking_methods",
        "vocab.courses",
        "vocab.service_temperatures",
        "vocab.aromatic_families",
        "scales.basic_tastes",
        "scales.structural",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "534d8590759d6dc3",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS dish schema (`schema/dish.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/dish.schema.json\",\n  \"title\": \"OGS Dish\",\n  \"description\": \"A composed dish with components, cooking methods, and overall sensory profile.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"components\",\n    \"overall_sensory_profile\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:dish:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": 
\"#/$defs/lstring\"\n    },\n    \"cuisine\": {\n      \"type\": \"string\",\n      \"description\": \"Free-text cuisine name (v0.1) OR a cuisine entity reference matching ogs:<ns>:cuisine:<id> (v0.2+).\"\n    },\n    \"course\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"amuse\",\n        \"appetizer\",\n        \"soup\",\n        \"salad\",\n        \"fish\",\n        \"main\",\n        \"cheese\",\n        \"dessert\",\n        \"petit_four\"\n      ]\n    },\n    \"service_temperature\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"frozen\",\n        \"cold\",\n        \"cool\",\n        \"room\",\n        \"warm\",\n        \"hot\",\n        \"very_hot\"\n      ]\n    },\n    \"components\": {\n      \"type\": \"array\",\n      \"minItems\": 1,\n      \"items\": {\n        \"$ref\": \"#/$defs/component\"\n      }\n    },\n    \"overall_sensory_profile\": {\n      \"$ref\": \"#/$defs/sensory_profile\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          \"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              
\"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"component\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"role\",\n        \"ingredient_ref\"\n      ],\n      \"properties\": {\n        \"role\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"main_protein\",\n            \"main_vegetable\",\n            \"starch\",\n            \"sauce\",\n            \"fat_component\",\n            \"acid_component\",\n            
\"aromatic_garnish\",\n            \"seasoning\",\n            \"textural_element\"\n          ]\n        },\n        \"ingredient_ref\": {\n          \"type\": \"string\",\n          \"pattern\": \"^ogs:[a-z0-9-]+:ingredient:[a-z0-9-]+$\"\n        },\n        \"cooking_method\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"raw\",\n            \"seared\",\n            \"grilled\",\n            \"roasted\",\n            \"braised\",\n            \"poached\",\n            \"steamed\",\n            \"fried\",\n            \"confit\",\n            \"smoked\",\n            \"fermented\",\n            \"cured\",\n            \"pickled\",\n            \"dehydrated\",\n            \"sous_vide\",\n            \"blanched\",\n            \"sauteed\",\n            \"baked\",\n            \"charred\",\n            \"emulsified\"\n          ]\n        },\n        \"maillard_index\": {\n          \"type\": \"integer\",\n          \"minimum\": 0,\n          \"maximum\": 5\n        },\n        \"reduction_level\": {\n          \"type\": \"integer\",\n          \"minimum\": 0,\n          \"maximum\": 5\n        },\n        \"proportion\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"dominant\",\n            \"significant\",\n            \"accent\",\n            \"trace\"\n          ]\n        },\n        \"sensory_contribution\": {\n          \"type\": \"object\",\n          \"properties\": {\n            \"basic_tastes\": {\n              \"$ref\": \"#/$defs/basic_tastes\"\n            },\n            \"aromatics\": {\n              \"type\": \"array\",\n              \"items\": {\n                \"$ref\": \"#/$defs/aromatic_entry\"\n              }\n            },\n            \"texture\": {\n              \"type\": \"array\",\n              \"items\": {\n                \"type\": \"string\"\n              }\n            }\n          },\n          \"additionalProperties\": true\n        },\n        \"notes\": {\n          
\"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"sensory_profile\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"basic_tastes\": {\n          \"$ref\": \"#/$defs/basic_tastes\"\n        },\n        \"structural\": {\n          \"$ref\": \"#/$defs/structural\"\n        },\n        \"aromatics\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"$ref\": \"#/$defs/aromatic_entry\"\n          }\n        },\n        \"texture\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"finish_length\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        },\n        \"complexity\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 10\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"basic_tastes\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"sweet\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"sour\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"salty\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"bitter\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"umami\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"structural\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"body\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"acidity_perception\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"tannin\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"astringency\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        },\n        \"alcohol_heat\": {\n          \"$ref\": 
\"#/$defs/intensity_value\"\n        },\n        \"effervescence\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"intensity_value\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 10\n    },\n    \"aromatic_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"family\",\n        \"group\",\n        \"item\",\n        \"intensity\"\n      ],\n      \"properties\": {\n        \"family\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fruit\",\n            \"floral\",\n            \"herbal\",\n            \"spice\",\n            \"earth\",\n            \"wood\",\n            \"dairy\",\n            \"savory\",\n            \"confection\",\n            \"vegetal\",\n            \"marine\"\n          ]\n        },\n        \"group\": {\n          \"type\": \"string\"\n        },\n        \"item\": {\n          \"type\": \"string\"\n        },\n        \"qualifier\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"fresh\",\n            \"dried\",\n            \"cooked\",\n            \"candied\",\n            \"fermented\",\n            \"zest\",\n            \"overripe\"\n          ]\n        },\n        \"intensity\": {\n          \"$ref\": \"#/$defs/intensity_value\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, 
fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n### Dish courses\n\namuse, appetizer, soup, salad, fish, main, cheese, dessert, petit_four\n\n### Service temperatures\n\nfrozen, cold, cool, room, warm, hot, very_hot\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. `core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce an OGS dish JSON for pico de gallo (tomato, onion, cilantro, jalapeño, lime). JSON only.\n\nINPUT:\n{\n  \"dish_name\": \"pico de gallo\",\n  \"namespace\": \"core\",\n  \"id_slug\": \"pico-de-gallo\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/dish.schema.json",
        "required_fields": {
          "id": "ogs:core:dish:pico-de-gallo",
          "ogs_version": "0.1.0"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:dish:pico-de-gallo\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Pico de Gallo\",\n  \"description\": \"A fresh Mexican salsa made with diced tomato, onion, cilantro, jalapeño, and lime juice.\",\n  \"cuisine\": \"Mexican\",\n  \"course\": \"appetizer\",\n  \"service_temperature\": \"cold\",\n  \"components\": [\n    {\n      \"role\": \"main_vegetable\",\n      \"ingredient_ref\": \"ogs:core:ingredient:tomato\",\n      \"cooking_method\": \"raw\",\n      \"maillard_index\": 0,\n      \"proportion\": \"dominant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 4.0,\n          \"sour\": 3.5,\n          \"salty\": 0.5,\n          \"bitter\": 0.5,\n          \"umami\": 2.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"vegetal\",\n            \"group\": \"fresh_vegetable\",\n            \"item\": \"tomato\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 6.0\n          },\n          {\n            \"family\": \"fruit\",\n            \"group\": \"soft_fruit\",\n            \"item\": \"tomato_fruit\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 4.0\n          }\n        ],\n        \"texture\": [\"juicy\", \"crisp\", \"fleshy\"]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:white-onion\",\n      \"cooking_method\": \"raw\",\n      \"maillard_index\": 0,\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 2.0,\n          \"sour\": 0.5,\n          \"salty\": 0.0,\n          \"bitter\": 1.0,\n          \"umami\": 1.0\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"vegetal\",\n            \"group\": \"allium\",\n            \"item\": \"raw_onion\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 6.5\n          }\n        
],\n        \"texture\": [\"crisp\", \"crunchy\"]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:cilantro\",\n      \"cooking_method\": \"raw\",\n      \"maillard_index\": 0,\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 0.5,\n          \"sour\": 0.0,\n          \"salty\": 0.0,\n          \"bitter\": 1.5,\n          \"umami\": 0.0\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"herbal\",\n            \"group\": \"fresh_herb\",\n            \"item\": \"cilantro\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 7.0\n          }\n        ],\n        \"texture\": [\"delicate\", \"leafy\"]\n      }\n    },\n    {\n      \"role\": \"seasoning\",\n      \"ingredient_ref\": \"ogs:core:ingredient:jalapeno\",\n      \"cooking_method\": \"raw\",\n      \"maillard_index\": 0,\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 1.0,\n          \"sour\": 0.5,\n          \"salty\": 0.0,\n          \"bitter\": 1.0,\n          \"umami\": 0.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"spice\",\n            \"group\": \"chili\",\n            \"item\": \"jalapeno\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 5.5\n          },\n          {",
          "compliant": false,
          "score": 0.0,
          "detail": {
            "error": "no_json"
          },
          "metrics": {
            "input_tokens": 3736,
            "output_tokens": 1023,
            "latency_ms": 12202.1,
            "usd": 0.026553,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "max_tokens"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:dish:pico-de-gallo\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Pico de Gallo\",\n  \"description\": \"A fresh Mexican salsa made with diced tomato, white onion, cilantro, jalapeño, and lime juice. Served raw and uncooked.\",\n  \"cuisine\": \"Mexican\",\n  \"course\": \"appetizer\",\n  \"service_temperature\": \"cold\",\n  \"components\": [\n    {\n      \"role\": \"main_vegetable\",\n      \"ingredient_ref\": \"ogs:core:ingredient:tomato\",\n      \"cooking_method\": \"raw\",\n      \"maillard_index\": 0,\n      \"proportion\": \"dominant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 4.0,\n          \"sour\": 3.5,\n          \"salty\": 0.5,\n          \"umami\": 3.0\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"vegetal\",\n            \"group\": \"nightshade\",\n            \"item\": \"fresh tomato\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 6.0\n          }\n        ],\n        \"texture\": [\"juicy\", \"soft\", \"chunky\"]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:white-onion\",\n      \"cooking_method\": \"raw\",\n      \"maillard_index\": 0,\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 2.0,\n          \"bitter\": 1.5,\n          \"salty\": 0.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"vegetal\",\n            \"group\": \"allium\",\n            \"item\": \"raw onion\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 5.5\n          }\n        ],\n        \"texture\": [\"crisp\", \"crunchy\"]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:cilantro\",\n      \"cooking_method\": \"raw\",\n      \"maillard_index\": 0,\n      \"proportion\": 
\"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"bitter\": 1.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"herbal\",\n            \"group\": \"soft herb\",\n            \"item\": \"cilantro\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 6.5\n          }\n        ],\n        \"texture\": [\"tender\"]\n      }\n    },\n    {\n      \"role\": \"seasoning\",\n      \"ingredient_ref\": \"ogs:core:ingredient:jalapeno\",\n      \"cooking_method\": \"raw\",\n      \"maillard_index\": 0,\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"bitter\": 1.0,\n          \"salty\": 0.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"vegetal\",\n            \"group\": \"pepper\",\n            \"item\": \"fresh jalapeño\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 5.0\n          }\n        ],\n        \"texture\": [\"crisp\", \"firm\"]\n      }\n    },\n    {\n      \"role\": \"acid_component\",\n      \"ingredient_ref\": \"ogs:core:ingredient:lime-juice\",\n      \"cooking_method\": \"raw\",\n      \"maillard_index\": 0,\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sour\": 7.0,\n          \"sweet\": 1.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"fruit\",\n            \"group\": \"citrus\",\n            \"item\": \"lime\",\n            \"qualifier",
          "compliant": false,
          "score": 0.0,
          "detail": {
            "error": "no_json"
          },
          "metrics": {
            "input_tokens": 3736,
            "output_tokens": 1024,
            "latency_ms": 11974.1,
            "usd": 0.026568,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "max_tokens"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:dish:pico-de-gallo\",\n  \"ogs_version\": \"0.1.0\",\n  \"primary_language\": \"en\",\n  \"name\": \"Pico de Gallo\",\n  \"description\": \"A fresh Mexican salsa made with diced tomato, onion, cilantro, jalapeño, and lime juice. Served raw and uncooked.\",\n  \"cuisine\": \"Mexican\",\n  \"course\": \"appetizer\",\n  \"service_temperature\": \"cold\",\n  \"components\": [\n    {\n      \"role\": \"main_vegetable\",\n      \"ingredient_ref\": \"ogs:core:ingredient:tomato\",\n      \"cooking_method\": \"raw\",\n      \"maillard_index\": 0,\n      \"proportion\": \"dominant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 4.0,\n          \"sour\": 3.5,\n          \"salty\": 0.5,\n          \"bitter\": 1.0,\n          \"umami\": 2.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"vegetal\",\n            \"group\": \"fresh vegetable\",\n            \"item\": \"tomato\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 6.0\n          },\n          {\n            \"family\": \"fruit\",\n            \"group\": \"berry\",\n            \"item\": \"tomato fruit\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 4.0\n          }\n        ],\n        \"texture\": [\"juicy\", \"soft\", \"pulpy\"]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:white-onion\",\n      \"cooking_method\": \"raw\",\n      \"maillard_index\": 0,\n      \"proportion\": \"significant\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 2.0,\n          \"sour\": 0.5,\n          \"salty\": 0.0,\n          \"bitter\": 1.5,\n          \"umami\": 1.0\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"vegetal\",\n            \"group\": \"allium\",\n            \"item\": \"raw onion\",\n            \"qualifier\": \"fresh\",\n      
      \"intensity\": 5.5\n          }\n        ],\n        \"texture\": [\"crisp\", \"crunchy\"]\n      }\n    },\n    {\n      \"role\": \"aromatic_garnish\",\n      \"ingredient_ref\": \"ogs:core:ingredient:cilantro\",\n      \"cooking_method\": \"raw\",\n      \"maillard_index\": 0,\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 1.0,\n          \"sour\": 0.5,\n          \"salty\": 0.0,\n          \"bitter\": 2.0,\n          \"umami\": 0.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"herbal\",\n            \"group\": \"soft herb\",\n            \"item\": \"cilantro\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 6.5\n          }\n        ],\n        \"texture\": [\"tender\", \"leafy\"]\n      }\n    },\n    {\n      \"role\": \"seasoning\",\n      \"ingredient_ref\": \"ogs:core:ingredient:jalapeno\",\n      \"cooking_method\": \"raw\",\n      \"maillard_index\": 0,\n      \"proportion\": \"accent\",\n      \"sensory_contribution\": {\n        \"basic_tastes\": {\n          \"sweet\": 1.5,\n          \"sour\": 0.5,\n          \"salty\": 0.0,\n          \"bitter\": 1.5,\n          \"umami\": 0.5\n        },\n        \"aromatics\": [\n          {\n            \"family\": \"vegetal\",\n            \"group\": \"pepper\",\n            \"item\": \"jalapeño\",\n            \"qualifier\": \"fresh\",\n            \"intensity\": 5.",
          "compliant": false,
          "score": 0.0,
          "detail": {
            "error": "no_json"
          },
          "metrics": {
            "input_tokens": 3736,
            "output_tokens": 1024,
            "latency_ms": 13008.7,
            "usd": 0.026568,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "max_tokens"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 0.0,
      "metrics": {
        "input_tokens_mean": 3736.0,
        "output_tokens_mean": 1023.667,
        "latency_ms_mean": 12394.967,
        "usd_sum": 0.079689
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-018",
      "category": "structured_output",
      "capability": "sensory_profile_json",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "json_match",
      "context_needs": [
        "scales.basic_tastes",
        "scales.structural",
        "vocab.aromatic_families",
        "vocab.qualifiers"
      ],
      "prompt_template_hash": "1470a28b8e9ef028",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Aromatic qualifiers\n\nfresh, dried, cooked, candied, fermented, zest, overripe\n\n---\n\nProduce a JSON sensory profile for 24-month aged Comté cheese. Keys: basic_tastes, structural, aromatics. All intensities on 0-10. JSON only.\n\nINPUT:\n{\n  \"item\": \"24-month aged Comté\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "value": {
          "basic_tastes": {
            "sweet": 2.0,
            "sour": 2.0,
            "salty": 5.0,
            "bitter": 1.5,
            "umami": 8.0
          },
          "structural": {
            "body": 7.0,
            "acidity_perception": 3.0,
            "tannin": 0,
            "astringency": 0,
            "alcohol_heat": 0,
            "effervescence": 0
          }
        },
        "required_keys": [
          "basic_tastes.umami",
          "basic_tastes.salty",
          "structural.body"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"basic_tastes\": {\n    \"sweet\": 4.5,\n    \"sour\": 3.0,\n    \"salty\": 5.5,\n    \"bitter\": 2.0,\n    \"umami\": 8.5\n  },\n  \"structural\": {\n    \"body\": 8.0,\n    \"acidity_perception\": 3.0,\n    \"tannin\": 0.5,\n    \"astringency\": 1.5,\n    \"alcohol_heat\": 0.0,\n    \"effervescence\": 0.0\n  },\n  \"aromatics\": [\n    {\n      \"family\": \"dairy\",\n      \"qualifier\": \"fermented\",\n      \"intensity\": 7.5\n    },\n    {\n      \"family\": \"savory\",\n      \"qualifier\": \"dried\",\n      \"intensity\": 8.0\n    },\n    {\n      \"family\": \"spice\",\n      \"qualifier\": \"dried\",\n      \"intensity\": 5.5\n    },\n    {\n      \"family\": \"earth\",\n      \"qualifier\": \"dried\",\n      \"intensity\": 4.5\n    },\n    {\n      \"family\": \"confection\",\n      \"qualifier\": \"cooked\",\n      \"intensity\": 4.0\n    },\n    {\n      \"family\": \"fruit\",\n      \"qualifier\": \"dried\",\n      \"intensity\": 3.5\n    },\n    {\n      \"family\": \"wood\",\n      \"qualifier\": \"dried\",\n      \"intensity\": 3.0\n    },\n    {\n      \"family\": \"herbal\",\n      \"qualifier\": \"dried\",\n      \"intensity\": 2.5\n    }\n  ]\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_key": {
              "basic_tastes.umami": {
                "predicted": 8.5,
                "expected": 8.0,
                "match": true
              },
              "basic_tastes.salty": {
                "predicted": 5.5,
                "expected": 5.0,
                "match": true
              },
              "structural.body": {
                "predicted": 8.0,
                "expected": 7.0,
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 524,
            "output_tokens": 413,
            "latency_ms": 6582.1,
            "usd": 0.007767,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"basic_tastes\": {\n    \"sweet\": 4.5,\n    \"sour\": 3.0,\n    \"salty\": 5.5,\n    \"bitter\": 1.5,\n    \"umami\": 8.0\n  },\n  \"structural\": {\n    \"body\": 8.5,\n    \"acidity_perception\": 3.0,\n    \"tannin\": 0.5,\n    \"astringency\": 1.5,\n    \"alcohol_heat\": 0.0,\n    \"effervescence\": 0.0\n  },\n  \"aromatics\": [\n    {\n      \"family\": \"dairy\",\n      \"qualifier\": \"fermented\",\n      \"intensity\": 7.5\n    },\n    {\n      \"family\": \"savory\",\n      \"qualifier\": \"dried\",\n      \"intensity\": 8.0\n    },\n    {\n      \"family\": \"spice\",\n      \"qualifier\": \"dried\",\n      \"intensity\": 5.5\n    },\n    {\n      \"family\": \"fruit\",\n      \"qualifier\": \"dried\",\n      \"intensity\": 5.0\n    },\n    {\n      \"family\": \"earth\",\n      \"qualifier\": \"fresh\",\n      \"intensity\": 4.5\n    },\n    {\n      \"family\": \"confection\",\n      \"qualifier\": \"cooked\",\n      \"intensity\": 4.0\n    },\n    {\n      \"family\": \"wood\",\n      \"qualifier\": \"dried\",\n      \"intensity\": 3.0\n    },\n    {\n      \"family\": \"floral\",\n      \"qualifier\": \"dried\",\n      \"intensity\": 2.5\n    }\n  ]\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_key": {
              "basic_tastes.umami": {
                "predicted": 8.0,
                "expected": 8.0,
                "match": true
              },
              "basic_tastes.salty": {
                "predicted": 5.5,
                "expected": 5.0,
                "match": true
              },
              "structural.body": {
                "predicted": 8.5,
                "expected": 7.0,
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 524,
            "output_tokens": 413,
            "latency_ms": 5508.9,
            "usd": 0.007767,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"basic_tastes\": {\n    \"sweet\": 4.5,\n    \"sour\": 3.0,\n    \"salty\": 6.0,\n    \"bitter\": 1.5,\n    \"umami\": 8.5\n  },\n  \"structural\": {\n    \"body\": 8.5,\n    \"acidity_perception\": 3.0,\n    \"tannin\": 0.5,\n    \"astringency\": 1.5,\n    \"alcohol_heat\": 0.5,\n    \"effervescence\": 0.0\n  },\n  \"aromatics\": [\n    {\n      \"family\": \"dairy\",\n      \"qualifier\": \"fermented\",\n      \"intensity\": 7.5\n    },\n    {\n      \"family\": \"savory\",\n      \"qualifier\": \"dried\",\n      \"intensity\": 8.0\n    },\n    {\n      \"family\": \"spice\",\n      \"qualifier\": \"dried\",\n      \"intensity\": 5.5\n    },\n    {\n      \"family\": \"earth\",\n      \"qualifier\": \"dried\",\n      \"intensity\": 4.5\n    },\n    {\n      \"family\": \"confection\",\n      \"qualifier\": \"cooked\",\n      \"intensity\": 5.0\n    },\n    {\n      \"family\": \"fruit\",\n      \"qualifier\": \"dried\",\n      \"intensity\": 4.0\n    },\n    {\n      \"family\": \"wood\",\n      \"qualifier\": \"dried\",\n      \"intensity\": 3.0\n    }\n  ]\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_key": {
              "basic_tastes.umami": {
                "predicted": 8.5,
                "expected": 8.0,
                "match": true
              },
              "basic_tastes.salty": {
                "predicted": 6.0,
                "expected": 5.0,
                "match": true
              },
              "structural.body": {
                "predicted": 8.5,
                "expected": 7.0,
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 524,
            "output_tokens": 381,
            "latency_ms": 5239.8,
            "usd": 0.007287,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 524.0,
        "output_tokens_mean": 402.333,
        "latency_ms_mean": 5776.933,
        "usd_sum": 0.022821
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-019",
      "category": "structured_output",
      "capability": "sensory_profile_json",
      "difficulty": "hard",
      "cuisine_region": "japanese",
      "scorer": "json_match",
      "context_needs": [
        "scales.basic_tastes",
        "scales.structural",
        "vocab.aromatic_families",
        "vocab.qualifiers"
      ],
      "prompt_template_hash": "aa2726470aa7cfb8",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Basic tastes scale (0-10)\n\nEach of `sweet`, `sour`, `salty`, `bitter`, `umami` is rated on a\n0-10 intensity scale where 0 = not detectable, 5 = clearly present,\n10 = overwhelming. Use decimals when precision helps (e.g. 8.5).\n\n### Structural attributes scale (0-10)\n\n`body`, `acidity_perception`, `tannin`, `astringency`, `alcohol_heat`,\n`effervescence` are each rated on the same 0-10 scale as basic tastes.\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n### Aromatic qualifiers\n\nfresh, dried, cooked, candied, fermented, zest, overripe\n\n---\n\nProduce a JSON sensory profile for blistered shishito peppers lightly salted. Keys: basic_tastes, structural, aromatics.\n\nINPUT:\n{\n  \"item\": \"blistered shishito peppers with flaky salt\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "value": {
          "basic_tastes": {
            "sweet": 2.0,
            "sour": 1.0,
            "salty": 5.0,
            "bitter": 3.0,
            "umami": 2.0
          },
          "structural": {
            "body": 3.0,
            "acidity_perception": 2.0,
            "tannin": 0,
            "astringency": 1.0,
            "alcohol_heat": 0,
            "effervescence": 0
          }
        },
        "required_keys": [
          "basic_tastes.salty",
          "basic_tastes.bitter",
          "structural.body"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"basic_tastes\": {\n    \"sweet\": 3.5,\n    \"sour\": 1.0,\n    \"salty\": 4.5,\n    \"bitter\": 2.5,\n    \"umami\": 3.0\n  },\n  \"structural\": {\n    \"body\": 3.0,\n    \"acidity_perception\": 1.5,\n    \"tannin\": 1.0,\n    \"astringency\": 2.0,\n    \"alcohol_heat\": 1.5,\n    \"effervescence\": 0.0\n  },\n  \"aromatics\": [\n    {\n      \"family\": \"vegetal\",\n      \"qualifier\": \"cooked\",\n      \"intensity\": 7.0,\n      \"notes\": \"blistered green pepper, charred skin\"\n    },\n    {\n      \"family\": \"earth\",\n      \"qualifier\": \"dried\",\n      \"intensity\": 3.5,\n      \"notes\": \"subtle smoky, char notes from blistering\"\n    },\n    {\n      \"family\": \"herbal\",\n      \"qualifier\": \"fresh\",\n      \"intensity\": 4.0,\n      \"notes\": \"grassy, green herb character\"\n    },\n    {\n      \"family\": \"savory\",\n      \"qualifier\": \"cooked\",\n      \"intensity\": 4.5,\n      \"notes\": \"flaky salt enhancement, mild savory depth\"\n    },\n    {\n      \"family\": \"spice\",\n      \"qualifier\": \"fresh\",\n      \"intensity\": 2.0,\n      \"notes\": \"occasional mild heat, gentle peppery warmth\"\n    }\n  ]\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_key": {
              "basic_tastes.salty": {
                "predicted": 4.5,
                "expected": 5.0,
                "match": true
              },
              "basic_tastes.bitter": {
                "predicted": 2.5,
                "expected": 3.0,
                "match": true
              },
              "structural.body": {
                "predicted": 3.0,
                "expected": 3.0,
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 520,
            "output_tokens": 404,
            "latency_ms": 6139.1,
            "usd": 0.00762,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"basic_tastes\": {\n    \"sweet\": 3.5,\n    \"sour\": 1.0,\n    \"salty\": 4.5,\n    \"bitter\": 2.5,\n    \"umami\": 3.0\n  },\n  \"structural\": {\n    \"body\": 3.0,\n    \"acidity_perception\": 1.5,\n    \"tannin\": 1.0,\n    \"astringency\": 2.0,\n    \"alcohol_heat\": 1.5,\n    \"effervescence\": 0.0\n  },\n  \"aromatics\": [\n    {\n      \"family\": \"vegetal\",\n      \"qualifier\": \"cooked\",\n      \"intensity\": 7.0,\n      \"notes\": \"blistered green pepper, charred skin\"\n    },\n    {\n      \"family\": \"herbal\",\n      \"qualifier\": \"fresh\",\n      \"intensity\": 5.0,\n      \"notes\": \"grassy, green herb undertone\"\n    },\n    {\n      \"family\": \"spice\",\n      \"qualifier\": \"dried\",\n      \"intensity\": 2.5,\n      \"notes\": \"mild peppery warmth, occasional heat spike\"\n    },\n    {\n      \"family\": \"earth\",\n      \"qualifier\": \"cooked\",\n      \"intensity\": 3.0,\n      \"notes\": \"smoky char, light wood-fire nuance\"\n    },\n    {\n      \"family\": \"savory\",\n      \"qualifier\": \"cooked\",\n      \"intensity\": 3.5,\n      \"notes\": \"umami-forward blister, flaky salt enhancement\"\n    }\n  ]\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_key": {
              "basic_tastes.salty": {
                "predicted": 4.5,
                "expected": 5.0,
                "match": true
              },
              "basic_tastes.bitter": {
                "predicted": 2.5,
                "expected": 3.0,
                "match": true
              },
              "structural.body": {
                "predicted": 3.0,
                "expected": 3.0,
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 520,
            "output_tokens": 408,
            "latency_ms": 5561.6,
            "usd": 0.00768,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"basic_tastes\": {\n    \"sweet\": 3.5,\n    \"sour\": 1.0,\n    \"salty\": 4.5,\n    \"bitter\": 2.5,\n    \"umami\": 3.0\n  },\n  \"structural\": {\n    \"body\": 3.0,\n    \"acidity_perception\": 1.5,\n    \"tannin\": 1.0,\n    \"astringency\": 2.0,\n    \"alcohol_heat\": 0.0,\n    \"effervescence\": 0.0\n  },\n  \"aromatics\": [\n    {\n      \"family\": \"vegetal\",\n      \"qualifier\": \"cooked\",\n      \"intensity\": 7.0,\n      \"notes\": \"blistered green pepper skin, charred vegetable\"\n    },\n    {\n      \"family\": \"spice\",\n      \"qualifier\": \"fresh\",\n      \"intensity\": 3.5,\n      \"notes\": \"mild capsicum warmth, occasional heat spike\"\n    },\n    {\n      \"family\": \"earth\",\n      \"qualifier\": \"cooked\",\n      \"intensity\": 4.5,\n      \"notes\": \"smoky char, light ash from high-heat blister\"\n    },\n    {\n      \"family\": \"herbal\",\n      \"qualifier\": \"fresh\",\n      \"intensity\": 3.0,\n      \"notes\": \"grassy green notes from raw pepper interior\"\n    },\n    {\n      \"family\": \"savory\",\n      \"qualifier\": \"cooked\",\n      \"intensity\": 4.0,\n      \"notes\": \"maillard-driven savory depth from blistered skin, flaky salt enhancement\"\n    }\n  ]\n}",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "per_key": {
              "basic_tastes.salty": {
                "predicted": 4.5,
                "expected": 5.0,
                "match": true
              },
              "basic_tastes.bitter": {
                "predicted": 2.5,
                "expected": 3.0,
                "match": true
              },
              "structural.body": {
                "predicted": 3.0,
                "expected": 3.0,
                "match": true
              }
            }
          },
          "metrics": {
            "input_tokens": 520,
            "output_tokens": 419,
            "latency_ms": 6741.7,
            "usd": 0.007845,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 520.0,
        "output_tokens_mean": 410.333,
        "latency_ms_mean": 6147.467,
        "usd_sum": 0.023145
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:struct-020",
      "category": "structured_output",
      "capability": "schema_valid_pairing",
      "difficulty": "expert",
      "cuisine_region": "mexican",
      "scorer": "schema_validation",
      "context_needs": [
        "schema.pairing",
        "vocab.match_types",
        "vocab.explanation_codes",
        "vocab.grade_scale",
        "id.format",
        "ogs_version"
      ],
      "prompt_template_hash": "5b115600e8dad71f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS pairing schema (`schema/pairing.schema.json`)\n\n```json\n{\n  \"$schema\": \"https://json-schema.org/draft/2020-12/schema\",\n  \"$id\": \"https://opengastronomy.org/schema/v0.2/pairing.schema.json\",\n  \"title\": \"OGS Pairing\",\n  \"description\": \"A scored, explained pairing between a dish and a beverage.\",\n  \"type\": \"object\",\n  \"required\": [\n    \"id\",\n    \"ogs_version\",\n    \"name\",\n    \"dish_ref\",\n    \"beverage_ref\",\n    \"match_types\",\n    \"overall_score\",\n    \"grade\",\n    \"explanation_codes\",\n    \"confidence\"\n  ],\n  \"properties\": {\n    \"id\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:pairing:[a-z0-9-]+$\"\n    },\n    \"ogs_version\": {\n      \"type\": \"string\",\n      \"pattern\": \"^0\\\\.(1|2)\\\\.\\\\d+$\"\n    },\n    \"primary_language\": {\n      \"$ref\": \"#/$defs/bcp47_tag\"\n    },\n    \"available_languages\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/bcp47_tag\"\n      }\n    },\n    \"name\": {\n     
 \"$ref\": \"#/$defs/lstring\"\n    },\n    \"description\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"dish_ref\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:dish:[a-z0-9-]+$\"\n    },\n    \"beverage_ref\": {\n      \"type\": \"string\",\n      \"pattern\": \"^ogs:[a-z0-9-]+:beverage:[a-z0-9-]+$\"\n    },\n    \"match_types\": {\n      \"type\": \"array\",\n      \"minItems\": 1,\n      \"items\": {\n        \"$ref\": \"#/$defs/match_type_entry\"\n      }\n    },\n    \"overall_score\": {\n      \"type\": \"integer\",\n      \"minimum\": 0,\n      \"maximum\": 100\n    },\n    \"grade\": {\n      \"type\": \"string\",\n      \"enum\": [\n        \"A+\",\n        \"A\",\n        \"B+\",\n        \"B\",\n        \"C+\",\n        \"C\",\n        \"D\",\n        \"F\"\n      ]\n    },\n    \"explanation_codes\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/explanation_code_entry\"\n      }\n    },\n    \"risks\": {\n      \"type\": \"array\",\n      \"items\": {\n        \"$ref\": \"#/$defs/risk_entry\"\n      }\n    },\n    \"confidence\": {\n      \"type\": \"number\",\n      \"minimum\": 0,\n      \"maximum\": 1\n    },\n    \"evidence\": {\n      \"$ref\": \"#/$defs/evidence\"\n    },\n    \"notes\": {\n      \"$ref\": \"#/$defs/lstring\"\n    },\n    \"metadata\": {\n      \"$ref\": \"#/$defs/metadata\"\n    }\n  },\n  \"additionalProperties\": true,\n  \"$defs\": {\n    \"bcp47_tag\": {\n      \"type\": \"string\",\n      \"pattern\": \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\"\n    },\n    \"lstring\": {\n      \"oneOf\": [\n        {\n          \"type\": \"string\",\n          \"minLength\": 1\n        },\n        {\n          \"type\": \"object\",\n          \"minProperties\": 1,\n          \"patternProperties\": {\n            \"^[A-Za-z]{2,3}(-[A-Za-z0-9]{2,8})*$\": {\n              \"type\": \"string\",\n              \"minLength\": 1\n            }\n          },\n          
\"additionalProperties\": false\n        },\n        {\n          \"type\": \"object\",\n          \"required\": [\n            \"labels\"\n          ],\n          \"properties\": {\n            \"primary_language\": {\n              \"$ref\": \"#/$defs/bcp47_tag\"\n            },\n            \"labels\": {\n              \"type\": \"array\",\n              \"minItems\": 1,\n              \"items\": {\n                \"type\": \"object\",\n                \"required\": [\n                  \"value\",\n                  \"language\"\n                ],\n                \"properties\": {\n                  \"value\": {\n                    \"type\": \"string\",\n                    \"minLength\": 1\n                  },\n                  \"language\": {\n                    \"$ref\": \"#/$defs/bcp47_tag\"\n                  },\n                  \"script\": {\n                    \"type\": \"string\",\n                    \"pattern\": \"^[A-Z][a-z]{3}$\"\n                  },\n                  \"role\": {\n                    \"type\": \"string\",\n                    \"enum\": [\n                      \"preferred\",\n                      \"alternate\",\n                      \"transliteration\",\n                      \"descriptive\",\n                      \"loan\",\n                      \"historical\",\n                      \"deprecated\",\n                      \"ipa\"\n                    ]\n                  },\n                  \"transliteration_scheme\": {\n                    \"type\": \"string\"\n                  },\n                  \"translation\": {\n                    \"type\": \"object\",\n                    \"additionalProperties\": true\n                  }\n                },\n                \"additionalProperties\": false\n              }\n            }\n          },\n          \"additionalProperties\": false\n        }\n      ]\n    },\n    \"match_type_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"type\",\n 
       \"score\"\n      ],\n      \"properties\": {\n        \"type\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"harmony\",\n            \"contrast\",\n            \"bridge\",\n            \"intensity_alignment\",\n            \"structural_balance\",\n            \"refresh\"\n          ]\n        },\n        \"score\": {\n          \"type\": \"integer\",\n          \"minimum\": 0,\n          \"maximum\": 100\n        },\n        \"description\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"primary\": {\n          \"type\": \"boolean\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"explanation_code_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"code\"\n      ],\n      \"properties\": {\n        \"code\": {\n          \"type\": \"string\"\n        },\n        \"weight\": {\n          \"type\": \"number\",\n          \"minimum\": 0,\n          \"maximum\": 1\n        },\n        \"detail\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"risk_entry\": {\n      \"type\": \"object\",\n      \"required\": [\n        \"code\",\n        \"severity\"\n      ],\n      \"properties\": {\n        \"code\": {\n          \"type\": \"string\"\n        },\n        \"severity\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"low\",\n            \"medium\",\n            \"high\"\n          ]\n        },\n        \"description\": {\n          \"$ref\": \"#/$defs/lstring\"\n        },\n        \"mitigation\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": false\n    },\n    \"evidence\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"method\": {\n          \"type\": \"string\",\n          \"enum\": [\n            \"expert_panel\",\n            \"instrumental\",\n            \"computed\",\n            \"estimated\"\n  
        ]\n        },\n        \"sources\": {\n          \"type\": \"array\",\n          \"items\": {\n            \"type\": \"string\"\n          }\n        },\n        \"notes\": {\n          \"$ref\": \"#/$defs/lstring\"\n        }\n      },\n      \"additionalProperties\": true\n    },\n    \"metadata\": {\n      \"type\": \"object\",\n      \"properties\": {\n        \"created\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"updated\": {\n          \"type\": \"string\",\n          \"format\": \"date-time\"\n        },\n        \"source\": {\n          \"type\": \"string\"\n        },\n        \"license\": {\n          \"type\": \"string\"\n        }\n      },\n      \"additionalProperties\": true\n    }\n  }\n}\n```\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n### OGS Explanation Codes\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a 
crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**negative**\n- `RISK_TANNIN_FISH` — Tannin-Fish Clash: Tannins interact with fish oils to produce metallic, unpleasant off-flavors.\n- `RISK_ALCOHOL_OVERPOWERS` — Alcohol Overpowers Dish: High alcohol content overwhelms the dish's delicate flavors with heat and bitterness.\n- `RISK_BITTER_AMPLIFY` — Bitterness Amplification: Bitterness in both dish and beverage compounds to an unpleasant level.\n- `RISK_OAK_MASKS` — Oak Masks Dish: Heavy oak influence in the beverage masks or overwhelms the dish's subtle flavors.\n- `RISK_INTENSITY_MISMATCH` — Intensity Mismatch: One element is significantly more intense than the other, causing the weaker to be lost.\n- `RISK_ACID_CLASH` — Acid Clash: Excessive acidity from both dish and beverage creates an unpleasantly sharp experience.\n- `RISK_SWEETNESS_CONFLICT` — Sweetness Conflict: The dish makes the beverage taste less sweet, thin, or overly acidic.\n- `RISK_SPICE_ALCOHOL` — Spice-Alcohol Amplification: High alcohol amplifies the heat of spicy dishes to an uncomfortable level.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall flavor 
intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n### Grade scale (derived from 0-100 overall_score)\n\n- **A+**: 95-100\n- **A**: 90-94\n- **B+**: 85-89\n- **B**: 80-84\n- **C+**: 75-79\n- **C**: 70-74\n- **D**: 60-69\n- **F**: <60\n\n### OGS ID format\n\nAll OGS entity IDs follow the pattern:\n\n    ogs:<namespace>:<entity-type>:<slug>\n\n- `namespace` is lowercase alphanumeric with hyphens, e.g. 
`core`.\n- `entity-type` is one of `ingredient`, `dish`, `beverage`, `pairing`,\n  `benchmark-task`, etc.\n- `slug` is lowercase alphanumeric with hyphens.\n\nExample: `ogs:core:ingredient:fleur-de-sel`.\n\n### OGS version\n\nAll produced documents must set `ogs_version` to **\"0.1.0\"**.\n\n---\n\nProduce an OGS pairing JSON for mole-negro enchiladas with Aragonese Garnacha. JSON only.\n\nINPUT:\n{\n  \"pairing_slug\": \"mole-negro-garnacha\",\n  \"dish_ref\": \"ogs:core:dish:mole-negro-enchiladas\",\n  \"beverage_ref\": \"ogs:core:beverage:garnacha-aragon\",\n  \"intent\": \"Cross-cultural bridge pairing: ripe fruit and body handle chocolate-chile sauce.\"\n}\n\nANSWER FORMAT: json",
      "expected": {
        "schema_ref": "schema/pairing.schema.json",
        "required_fields": {
          "id": "ogs:core:pairing:mole-negro-garnacha",
          "ogs_version": "0.1.0",
          "dish_ref": "ogs:core:dish:mole-negro-enchiladas",
          "beverage_ref": "ogs:core:beverage:garnacha-aragon"
        }
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "{\n  \"id\": \"ogs:core:pairing:mole-negro-garnacha\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Mole Negro Enchiladas with Aragonese Garnacha\",\n  \"dish_ref\": \"ogs:core:dish:mole-negro-enchiladas\",\n  \"beverage_ref\": \"ogs:core:beverage:garnacha-aragon\",\n  \"match_types\": [\n    {\n      \"type\": \"bridge\",\n      \"score\": 88,\n      \"description\": \"Dried fruit, chocolate, and spice notes in mole negro find a bridge in Garnacha's ripe dark fruit and subtle spice character.\",\n      \"primary\": true\n    },\n    {\n      \"type\": \"intensity_alignment\",\n      \"score\": 82,\n      \"description\": \"The full-bodied weight of Aragonese Garnacha aligns with the deep, complex intensity of mole negro sauce.\"\n    },\n    {\n      \"type\": \"harmony\",\n      \"score\": 79,\n      \"description\": \"Shared earthy, spiced, and dark-fruit flavor profiles create a harmonious overall experience.\"\n    },\n    {\n      \"type\": \"structural_balance\",\n      \"score\": 75,\n      \"description\": \"Tannins in Garnacha interact with the fat and protein of the enchilada filling, softening the wine while enriching the dish.\"\n    }\n  ],\n  \"overall_score\": 84,\n  \"grade\": \"B\",\n  \"explanation_codes\": [\n    {\n      \"code\": \"AROMA_SHARED_SPICE\",\n      \"weight\": 0.25,\n      \"detail\": \"Mole negro's complex spice blend—chiles, cinnamon, cumin—mirrors the peppery, warm-spice notes of Aragonese Garnacha.\"\n    },\n    {\n      \"code\": \"AROMA_SHARED_EARTH\",\n      \"weight\": 0.18,\n      \"detail\": \"Earthy, dried-chile complexity in mole negro resonates with the earthy undertones present in Garnacha from Aragon.\"\n    },\n    {\n      \"code\": \"AROMA_SHARED_FRUIT\",\n      \"weight\": 0.15,\n      \"detail\": \"Dried fruit elements in the mole (raisins, prunes) echo the ripe dark-fruit character of Garnacha.\"\n    },\n    {\n      \"code\": \"STRUCT_BODY_ALIGN\",\n      \"weight\": 0.15,\n      
\"detail\": \"Garnacha's full body matches the rich, thick consistency of mole negro without being overwhelmed.\"\n    },\n    {\n      \"code\": \"STRUCT_INTENSITY_MATCH\",\n      \"weight\": 0.12,\n      \"detail\": \"Both mole negro and Aragonese Garnacha operate at high flavor intensity, ensuring neither element is lost.\"\n    },\n    {\n      \"code\": \"STRUCT_TANNIN_PROTEIN\",\n      \"weight\": 0.10,\n      \"detail\": \"Moderate tannins in Garnacha bind with proteins in the enchilada chicken or turkey filling, reducing astringency.\"\n    },\n    {\n      \"code\": \"HARMONY_COMPLEXITY_MATCH\",\n      \"weight\": 0.05,\n      \"detail\": \"The multi-layered complexity of a traditional mole negro, with 20+ ingredients, is matched by Garnacha's layered aromatic profile.\"\n    }\n  ],\n  \"risks\": [\n    {\n      \"code\": \"RISK_SPICE_ALCOHOL\",\n      \"severity\": \"medium\",\n      \"description\": \"Garnacha's typically elevated alcohol (14-15% ABV) may amplify chile heat in the mole, potentially intensifying the spice beyond comfort.\",\n      \"mitigation\": \"Select a lower-alcohol expression of Aragonese Garnacha or ensure the mole's chile heat is moderate rather than aggressive.\"\n    },\n    {\n      \"code\": \"RISK_OAK_MASKS\",\n      \"severity\": \"low\",\n      \"description\": \"Heavily oaked Garnacha expressions may overlay the delicate chocolate and floral notes within",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "[{'type': 'bridge', 'score': 88, 'description': \"Dried fruit, chocolate, and spice notes in mole negro find a bridge in Garnacha's ripe dark fruit and subtle spice character.\", 'primary': True}, {'type': 'intensity_alignment', 'score': 82, 'description': 'The full-bodied weight of Aragonese Garnacha aligns with the deep, complex intensity of mole negro sauce.'}, {'type': 'harmony', 'score': 79, 'description': 'Shared earthy, spiced, and dark-fruit flavor profiles create a harmonious overall experience.'}, {'type': 'structural_balance', 'score': 75, 'description': 'Tannins in Garnacha interact with the fat and protein of the enchilada filling, softening the wine while enriching the dish.'}] is not of type 'object'"
            ],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": null,
                "expected": "ogs:core:pairing:mole-negro-garnacha",
                "match": false
              },
              "ogs_version": {
                "predicted": null,
                "expected": "0.1.0",
                "match": false
              },
              "dish_ref": {
                "predicted": null,
                "expected": "ogs:core:dish:mole-negro-enchiladas",
                "match": false
              },
              "beverage_ref": {
                "predicted": null,
                "expected": "ogs:core:beverage:garnacha-aragon",
                "match": false
              }
            }
          },
          "metrics": {
            "input_tokens": 4192,
            "output_tokens": 1024,
            "latency_ms": 16223.0,
            "usd": 0.027936,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "max_tokens"
          }
        },
        {
          "sample": 1,
          "raw_response": "{\n  \"id\": \"ogs:core:pairing:mole-negro-garnacha\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Mole Negro Enchiladas with Aragonese Garnacha\",\n  \"dish_ref\": \"ogs:core:dish:mole-negro-enchiladas\",\n  \"beverage_ref\": \"ogs:core:beverage:garnacha-aragon\",\n  \"match_types\": [\n    {\n      \"type\": \"bridge\",\n      \"score\": 88,\n      \"description\": \"Ripe dark fruit in Garnacha bridges the dried chile and bittersweet chocolate notes of mole negro.\",\n      \"primary\": true\n    },\n    {\n      \"type\": \"intensity_alignment\",\n      \"score\": 84,\n      \"description\": \"The full body and high flavor intensity of both elements keep neither from overpowering the other.\"\n    },\n    {\n      \"type\": \"structural_balance\",\n      \"score\": 80,\n      \"description\": \"Soft tannins in Garnacha are absorbed by the rich fat and protein of the enchilada filling, while moderate acidity brightens the dense sauce.\"\n    },\n    {\n      \"type\": \"harmony\",\n      \"score\": 78,\n      \"description\": \"Shared spice and earthy complexity in both mole negro and Garnacha create a cohesive flavor experience.\"\n    }\n  ],\n  \"overall_score\": 85,\n  \"grade\": \"B+\",\n  \"explanation_codes\": [\n    {\n      \"code\": \"AROMA_SHARED_SPICE\",\n      \"weight\": 0.30,\n      \"detail\": \"Dried chile, cinnamon, and clove aromatics in mole negro echo the warm spice notes typical of Aragonese Garnacha.\"\n    },\n    {\n      \"code\": \"AROMA_SHARED_EARTH\",\n      \"weight\": 0.15,\n      \"detail\": \"Earthy, cocoa-driven notes in the mole align with the garrigue and mineral undertones of Garnacha.\"\n    },\n    {\n      \"code\": \"STRUCT_INTENSITY_MATCH\",\n      \"weight\": 0.20,\n      \"detail\": \"Both the complex, full-bodied mole sauce and the ripe, high-alcohol Garnacha operate at comparable flavor intensity levels.\"\n    },\n    {\n      \"code\": \"STRUCT_TANNIN_PROTEIN\",\n      \"weight\": 
0.15,\n      \"detail\": \"Proteins from chicken or pork filling bind soft Garnacha tannins, reducing astringency and smoothing the finish.\"\n    },\n    {\n      \"code\": \"STRUCT_ACIDITY_BALANCE\",\n      \"weight\": 0.10,\n      \"detail\": \"Garnacha's moderate acidity cuts through the richness of the mole and melted cheese without overwhelming the dish.\"\n    },\n    {\n      \"code\": \"AROMA_SHARED_FRUIT\",\n      \"weight\": 0.10,\n      \"detail\": \"Ripe plum and dark cherry fruit in Garnacha bridges the subtle sweetness from mulato and ancho chiles in the mole.\"\n    },\n    {\n      \"code\": \"RISK_SPICE_ALCOHOL\",\n      \"weight\": 0.05,\n      \"detail\": \"Elevated alcohol in Garnacha may marginally amplify residual chile heat; manageable given the richness of the sauce.\"\n    }\n  ],\n  \"risks\": [\n    {\n      \"code\": \"RISK_SPICE_ALCOHOL\",\n      \"severity\": \"low\",\n      \"description\": \"Higher alcohol levels in Aragonese Garnacha can accentuate chile heat in the mole negro.\",\n      \"mitigation\": \"Choose a Garnacha with moderate alcohol (13.5–14.5%) and serve slightly cool (16–17°C) to temper perceived heat.\"\n    },\n    {\n      \"code\": \"RISK_OAK_MASKS\",\n      \"severity\": \"low\",\n      \"description\": \"Heavily oaked Garnacha expressions may mask the nuanced cocoa and dried fruit complexity of the mole.\",\n      \"mitigation",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "[{'type': 'bridge', 'score': 88, 'description': 'Ripe dark fruit in Garnacha bridges the dried chile and bittersweet chocolate notes of mole negro.', 'primary': True}, {'type': 'intensity_alignment', 'score': 84, 'description': 'The full body and high flavor intensity of both elements keep neither from overpowering the other.'}, {'type': 'structural_balance', 'score': 80, 'description': 'Soft tannins in Garnacha are absorbed by the rich fat and protein of the enchilada filling, while moderate acidity brightens the dense sauce.'}, {'type': 'harmony', 'score': 78, 'description': 'Shared spice and earthy complexity in both mole negro and Garnacha create a cohesive flavor experience.'}] is not of type 'object'"
            ],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": null,
                "expected": "ogs:core:pairing:mole-negro-garnacha",
                "match": false
              },
              "ogs_version": {
                "predicted": null,
                "expected": "0.1.0",
                "match": false
              },
              "dish_ref": {
                "predicted": null,
                "expected": "ogs:core:dish:mole-negro-enchiladas",
                "match": false
              },
              "beverage_ref": {
                "predicted": null,
                "expected": "ogs:core:beverage:garnacha-aragon",
                "match": false
              }
            }
          },
          "metrics": {
            "input_tokens": 4192,
            "output_tokens": 1024,
            "latency_ms": 16036.2,
            "usd": 0.027936,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "max_tokens"
          }
        },
        {
          "sample": 2,
          "raw_response": "{\n  \"id\": \"ogs:core:pairing:mole-negro-garnacha\",\n  \"ogs_version\": \"0.1.0\",\n  \"name\": \"Mole Negro Enchiladas with Aragonese Garnacha\",\n  \"dish_ref\": \"ogs:core:dish:mole-negro-enchiladas\",\n  \"beverage_ref\": \"ogs:core:beverage:garnacha-aragon\",\n  \"match_types\": [\n    {\n      \"type\": \"bridge\",\n      \"score\": 88,\n      \"description\": \"Ripe dark fruit and spice notes in the Garnacha bridge the complex chocolate-chile mole sauce.\",\n      \"primary\": true\n    },\n    {\n      \"type\": \"harmony\",\n      \"score\": 82,\n      \"description\": \"The earthy, spiced depth of mole negro converges with the warm, full-bodied character of Aragonese Garnacha.\"\n    },\n    {\n      \"type\": \"structural_balance\",\n      \"score\": 79,\n      \"description\": \"The wine's moderate tannins and body align with the rich, dense mole sauce.\"\n    },\n    {\n      \"type\": \"intensity_alignment\",\n      \"score\": 84,\n      \"description\": \"Both mole negro and Garnacha are bold, complex, and deeply flavored, keeping neither element eclipsed.\"\n    }\n  ],\n  \"overall_score\": 85,\n  \"grade\": \"B+\",\n  \"explanation_codes\": [\n    {\n      \"code\": \"AROMA_SHARED_SPICE\",\n      \"weight\": 0.25,\n      \"detail\": \"Cinnamon, cumin, and dried chile spice notes resonate between the mole sauce and the peppery, spiced Garnacha.\"\n    },\n    {\n      \"code\": \"AROMA_SHARED_EARTH\",\n      \"weight\": 0.15,\n      \"detail\": \"Earthy, dark mineral tones in both the mole negro and the Aragonese terroir-driven Garnacha create an aromatic bridge.\"\n    },\n    {\n      \"code\": \"AROMA_SHARED_FRUIT\",\n      \"weight\": 0.15,\n      \"detail\": \"Dark berry and dried fruit aromatics in the Garnacha echo the raisin and prune notes found in mole negro.\"\n    },\n    {\n      \"code\": \"STRUCT_INTENSITY_MATCH\",\n      \"weight\": 0.20,\n      \"detail\": \"Mole negro's bold, layered intensity is 
well-matched by the Garnacha's ripe, full-bodied presence.\"\n    },\n    {\n      \"code\": \"STRUCT_TANNIN_PROTEIN\",\n      \"weight\": 0.10,\n      \"detail\": \"Proteins in the enchilada filling soften the Garnacha's tannins, improving mouthfeel integration.\"\n    },\n    {\n      \"code\": \"STRUCT_TANNIN_FAT\",\n      \"weight\": 0.10,\n      \"detail\": \"Fat from the mole sauce and cheese mellows any residual tannin astringency in the wine.\"\n    },\n    {\n      \"code\": \"HARMONY_COMPLEXITY_MATCH\",\n      \"weight\": 0.15,\n      \"detail\": \"Both elements boast multi-layered complexity—chocolate, chile, and spice in the mole; fruit, earth, and warmth in the Garnacha—ensuring neither overshadows the other.\"\n    },\n    {\n      \"code\": \"RISK_SPICE_ALCOHOL\",\n      \"weight\": 0.10,\n      \"detail\": \"Higher alcohol levels in Garnacha may slightly amplify the heat of the dried chiles in the mole.\"\n    }\n  ],\n  \"risks\": [\n    {\n      \"code\": \"RISK_SPICE_ALCOHOL\",\n      \"severity\": \"low\",\n      \"description\": \"The Garnacha's alcohol content (typically 14-15%) could amplify the chile heat of the mole negro.\",\n      \"mitigation\": \"Select a Garnacha with moderate alcohol (under 14.5%) or serve slightly chilled to reduce the perception of alcohol heat.\"",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "schema_valid": false,
            "schema_errors": [
              "[{'type': 'bridge', 'score': 88, 'description': 'Ripe dark fruit and spice notes in the Garnacha bridge the complex chocolate-chile mole sauce.', 'primary': True}, {'type': 'harmony', 'score': 82, 'description': 'The earthy, spiced depth of mole negro converges with the warm, full-bodied character of Aragonese Garnacha.'}, {'type': 'structural_balance', 'score': 79, 'description': \"The wine's moderate tannins and body align with the rich, dense mole sauce.\"}, {'type': 'intensity_alignment', 'score': 84, 'description': 'Both mole negro and Garnacha are bold, complex, and deeply flavored, keeping neither element eclipsed.'}] is not of type 'object'"
            ],
            "consistent": false,
            "required_fields": {
              "id": {
                "predicted": null,
                "expected": "ogs:core:pairing:mole-negro-garnacha",
                "match": false
              },
              "ogs_version": {
                "predicted": null,
                "expected": "0.1.0",
                "match": false
              },
              "dish_ref": {
                "predicted": null,
                "expected": "ogs:core:dish:mole-negro-enchiladas",
                "match": false
              },
              "beverage_ref": {
                "predicted": null,
                "expected": "ogs:core:beverage:garnacha-aragon",
                "match": false
              }
            }
          },
          "metrics": {
            "input_tokens": 4192,
            "output_tokens": 1024,
            "latency_ms": 16932.2,
            "usd": 0.027936,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "max_tokens"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 4192.0,
        "output_tokens_mean": 1024.0,
        "latency_ms_mean": 16397.133,
        "usd_sum": 0.083808
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-001",
      "category": "vocabulary",
      "capability": "aromatic_family_classification",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "e52270d817a9dc7f",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nUsing the OGS sensory model, return the single aromatic family (from the controlled vocabulary) that best classifies the aromatic item below. Respond with just the family name.\n\nINPUT:\n{\n  \"item\": \"lemon zest\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "fruit"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "fruit",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fruit",
            "expected": "fruit",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 381,
            "output_tokens": 4,
            "latency_ms": 1138.2,
            "usd": 0.001203,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "fruit",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fruit",
            "expected": "fruit",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 381,
            "output_tokens": 4,
            "latency_ms": 1209.0,
            "usd": 0.001203,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "fruit",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fruit",
            "expected": "fruit",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 381,
            "output_tokens": 4,
            "latency_ms": 1130.6,
            "usd": 0.001203,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 381.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1159.267,
        "usd_sum": 0.003609
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-002",
      "category": "vocabulary",
      "capability": "aromatic_family_classification",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "e2096242953ff882",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nUsing the OGS sensory model, return the single aromatic family that best classifies the item below. Respond with just the family name.\n\nINPUT:\n{\n  \"item\": \"wet stone\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "earth"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "earth",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "earth",
            "expected": "earth",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 371,
            "output_tokens": 4,
            "latency_ms": 1424.8,
            "usd": 0.001173,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "earth",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "earth",
            "expected": "earth",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 371,
            "output_tokens": 4,
            "latency_ms": 1846.1,
            "usd": 0.001173,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "earth",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "earth",
            "expected": "earth",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 371,
            "output_tokens": 4,
            "latency_ms": 1316.0,
            "usd": 0.001173,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 371.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1528.967,
        "usd_sum": 0.003519
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-003",
      "category": "vocabulary",
      "capability": "aromatic_family_classification",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "fa5fa32bfe275dfb",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nUsing the OGS sensory model, return the single aromatic family that best classifies the item below. Respond with just the family name.\n\nINPUT:\n{\n  \"item\": \"cooked beef fat\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "savory"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "savory",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "savory",
            "expected": "savory",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 373,
            "output_tokens": 5,
            "latency_ms": 1268.3,
            "usd": 0.001194,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "savory",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "savory",
            "expected": "savory",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 373,
            "output_tokens": 5,
            "latency_ms": 762.8,
            "usd": 0.001194,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "savory",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "savory",
            "expected": "savory",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 373,
            "output_tokens": 5,
            "latency_ms": 1365.8,
            "usd": 0.001194,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 373.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1132.3,
        "usd_sum": 0.003582
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-004",
      "category": "vocabulary",
      "capability": "cooking_method_vocabulary",
      "difficulty": "easy",
      "cuisine_region": "french",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.cooking_methods"
      ],
      "prompt_template_hash": "e0631034b8b8ece4",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n---\n\nPick the single OGS cooking method that best describes a protein slowly cooked while fully submerged in fat at a low temperature. Respond with just the method name from the OGS vocabulary.\n\nINPUT:\n{\n  \"description\": \"duck legs slow-cooked submerged in their own rendered fat at 80 C\"\n}\n\nANSWER FORMAT: one_of:raw,seared,grilled,roasted,braised,poached,steamed,fried,confit,smoked,fermented,cured,pickled,dehydrated,sous_vide,blanched,sauteed,baked,charred,emulsified",
      "expected": {
        "value": "confit"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "confit",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "confit",
            "expected": "confit",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 470,
            "output_tokens": 5,
            "latency_ms": 3034.1,
            "usd": 0.001485,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "confit",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "confit",
            "expected": "confit",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 470,
            "output_tokens": 5,
            "latency_ms": 2066.7,
            "usd": 0.001485,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "confit",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "confit",
            "expected": "confit",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 470,
            "output_tokens": 5,
            "latency_ms": 2298.1,
            "usd": 0.001485,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 470.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 2466.3,
        "usd_sum": 0.004455
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-005",
      "category": "vocabulary",
      "capability": "match_type_vocabulary",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "bbbd81bb3c8b8ea4",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nSelect the single OGS match type that best describes a pairing whose strength is driven by a specific shared aromatic compound or family between dish and beverage.\n\nINPUT:\n{\n  \"description\": \"dish and wine are linked by a shared earthy, forest-floor aromatic\"\n}\n\nANSWER FORMAT: one_of:harmony,contrast,bridge,intensity_alignment,structural_balance,refresh",
      "expected": {
        "value": "bridge"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "bridge",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "bridge",
            "expected": "bridge",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 361,
            "output_tokens": 4,
            "latency_ms": 1746.0,
            "usd": 0.001143,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "bridge",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "bridge",
            "expected": "bridge",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 361,
            "output_tokens": 4,
            "latency_ms": 1023.6,
            "usd": 0.001143,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "bridge",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "bridge",
            "expected": "bridge",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 361,
            "output_tokens": 4,
            "latency_ms": 1091.9,
            "usd": 0.001143,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 361.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1287.167,
        "usd_sum": 0.003429
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-006",
      "category": "vocabulary",
      "capability": "component_role_vocabulary",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.component_roles"
      ],
      "prompt_template_hash": "6c6ae6f82f354be3",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nReturn the single OGS component role that best fits the described element. Use only roles from the OGS controlled vocabulary.\n\nINPUT:\n{\n  \"element\": \"a soy-dashi based liquid ladled over the rice bowl, providing salt, umami and moisture\"\n}\n\nANSWER FORMAT: one_of:main_protein,main_vegetable,starch,sauce,fat_component,acid_component,aromatic_garnish,seasoning,textural_element",
      "expected": {
        "value": "sauce"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "sauce",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "sauce",
            "expected": "sauce",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 400,
            "output_tokens": 4,
            "latency_ms": 1318.7,
            "usd": 0.00126,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "sauce",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "sauce",
            "expected": "sauce",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 400,
            "output_tokens": 4,
            "latency_ms": 835.6,
            "usd": 0.00126,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "sauce",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "sauce",
            "expected": "sauce",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 400,
            "output_tokens": 4,
            "latency_ms": 780.1,
            "usd": 0.00126,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 400.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 978.133,
        "usd_sum": 0.00378
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-007",
      "category": "vocabulary",
      "capability": "qualifier_vocabulary",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.qualifiers"
      ],
      "prompt_template_hash": "0adac8c06022acf5",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic qualifiers\n\nfresh, dried, cooked, candied, fermented, zest, overripe\n\n---\n\nSelect the OGS aromatic qualifier that best describes how the item is expressed.\n\nINPUT:\n{\n  \"item\": \"candied ginger\"\n}\n\nANSWER FORMAT: one_of:fresh,dried,cooked,candied,fermented,zest,overripe",
      "expected": {
        "value": "candied"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "candied",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "candied",
            "expected": "candied",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 344,
            "output_tokens": 5,
            "latency_ms": 884.7,
            "usd": 0.001107,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "candied",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "candied",
            "expected": "candied",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 344,
            "output_tokens": 5,
            "latency_ms": 902.3,
            "usd": 0.001107,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "candied",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "candied",
            "expected": "candied",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 344,
            "output_tokens": 5,
            "latency_ms": 778.4,
            "usd": 0.001107,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 344.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 855.133,
        "usd_sum": 0.003321
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-008",
      "category": "vocabulary",
      "capability": "grade_scale",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.grade_scale"
      ],
      "prompt_template_hash": "e58319dc1276cdb2",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Grade scale (derived from 0-100 overall_score)\n\n- **A+**: 95-100\n- **A**: 90-94\n- **B+**: 85-89\n- **B**: 80-84\n- **C+**: 75-79\n- **C**: 70-74\n- **D**: 60-69\n- **F**: <60\n\n---\n\nUsing the OGS pairing grade scale (ogs-match.md §5), return the letter grade for the given overall score. Respond with just the grade token, e.g. 'A+'.\n\nINPUT:\n{\n  \"overall_score\": 92\n}\n\nANSWER FORMAT: one_of:A+,A,B+,B,C+,C,D,F",
      "expected": {
        "value": "A"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "A",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "A",
            "expected": "A",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 432,
            "output_tokens": 4,
            "latency_ms": 1200.5,
            "usd": 0.001356,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "A",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "A",
            "expected": "A",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 432,
            "output_tokens": 4,
            "latency_ms": 1109.3,
            "usd": 0.001356,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "A",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "A",
            "expected": "A",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 432,
            "output_tokens": 4,
            "latency_ms": 1249.5,
            "usd": 0.001356,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 432.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1186.433,
        "usd_sum": 0.004068
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-009",
      "category": "vocabulary",
      "capability": "aromatic_family_classification",
      "difficulty": "easy",
      "cuisine_region": "spanish",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "d51864ab27ecfa64",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nUsing the OGS sensory model, return the single aromatic family that best characterises smoked paprika. Respond with just the family name.\n\nINPUT:\n{\n  \"item\": \"smoked paprika (pimentón de la Vera)\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "spice"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "spice",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "spice",
            "expected": "spice",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 382,
            "output_tokens": 5,
            "latency_ms": 946.6,
            "usd": 0.001221,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "spice",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "spice",
            "expected": "spice",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 382,
            "output_tokens": 5,
            "latency_ms": 649.7,
            "usd": 0.001221,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "spice",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "spice",
            "expected": "spice",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 382,
            "output_tokens": 5,
            "latency_ms": 682.8,
            "usd": 0.001221,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 382.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 759.7,
        "usd_sum": 0.003663
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-010",
      "category": "vocabulary",
      "capability": "aromatic_family_classification",
      "difficulty": "medium",
      "cuisine_region": "japanese",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "1748069ad0ebdde7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nUsing the OGS sensory model, return the single dominant aromatic family of a classic kombu-katsuo dashi.\n\nINPUT:\n{\n  \"item\": \"classic kombu-katsuobushi dashi\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "marine"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "marine",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "marine",
            "expected": "marine",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 376,
            "output_tokens": 4,
            "latency_ms": 970.6,
            "usd": 0.001188,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "marine",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "marine",
            "expected": "marine",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 376,
            "output_tokens": 4,
            "latency_ms": 816.7,
            "usd": 0.001188,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "marine",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "marine",
            "expected": "marine",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 376,
            "output_tokens": 4,
            "latency_ms": 1431.4,
            "usd": 0.001188,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 376.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1072.9,
        "usd_sum": 0.003564
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-011",
      "category": "vocabulary",
      "capability": "cooking_method_vocabulary",
      "difficulty": "easy",
      "cuisine_region": "indian",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.cooking_methods"
      ],
      "prompt_template_hash": "85de6add5c1ebfba",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n---\n\nPick the single OGS cooking method that best matches tandoori chicken cooked at 450°C in a clay oven.\n\nINPUT:\n{\n  \"description\": \"tandoori chicken, marinated and cooked in a clay tandoor at ~450°C\"\n}\n\nANSWER FORMAT: one_of:raw,seared,grilled,roasted,braised,poached,steamed,fried,confit,smoked,fermented,cured,pickled,dehydrated,sous_vide,blanched,sauteed,baked,charred,emulsified",
      "expected": {
        "value": "roasted"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "charred",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "charred",
            "expected": "roasted",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 457,
            "output_tokens": 5,
            "latency_ms": 808.1,
            "usd": 0.001446,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "charred",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "charred",
            "expected": "roasted",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 457,
            "output_tokens": 5,
            "latency_ms": 905.6,
            "usd": 0.001446,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "charred",
          "compliant": true,
          "score": 0.0,
          "detail": {
            "predicted": "charred",
            "expected": "roasted",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 457,
            "output_tokens": 5,
            "latency_ms": 1115.5,
            "usd": 0.001446,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.0,
      "score_summary": {
        "mean": 0.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.0,
        "ci95_hi": 0.0,
        "n": 3
      },
      "score_if_valid": 0.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 457.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 943.067,
        "usd_sum": 0.004338
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-012",
      "category": "vocabulary",
      "capability": "cooking_method_vocabulary",
      "difficulty": "medium",
      "cuisine_region": "peruvian",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.cooking_methods"
      ],
      "prompt_template_hash": "44c9b26dc70c4557",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n---\n\nPick the single OGS cooking method that best describes Peruvian ceviche's treatment of white fish in lime juice.\n\nINPUT:\n{\n  \"description\": \"diced white fish denatured in freshly squeezed lime juice for 10 minutes with red onion and aji limo\"\n}\n\nANSWER FORMAT: one_of:raw,seared,grilled,roasted,braised,poached,steamed,fried,confit,smoked,fermented,cured,pickled,dehydrated,sous_vide,blanched,sauteed,baked,charred,emulsified",
      "expected": {
        "value": "cured"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "cured",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cured",
            "expected": "cured",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 465,
            "output_tokens": 5,
            "latency_ms": 1044.9,
            "usd": 0.00147,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "cured",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cured",
            "expected": "cured",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 465,
            "output_tokens": 5,
            "latency_ms": 1120.9,
            "usd": 0.00147,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "cured",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "cured",
            "expected": "cured",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 465,
            "output_tokens": 5,
            "latency_ms": 1160.5,
            "usd": 0.00147,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 465.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1108.767,
        "usd_sum": 0.00441
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-013",
      "category": "vocabulary",
      "capability": "component_role_vocabulary",
      "difficulty": "medium",
      "cuisine_region": "moroccan",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.component_roles"
      ],
      "prompt_template_hash": "dc3e62daf8bdf21a",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nReturn the single OGS component role that best fits harissa stirred into a lamb tagine at service.\n\nINPUT:\n{\n  \"description\": \"a spoonful of fiery harissa stirred into a lamb-and-apricot tagine at service\"\n}\n\nANSWER FORMAT: one_of:main_protein,main_vegetable,starch,sauce,fat_component,acid_component,aromatic_garnish,seasoning,textural_element",
      "expected": {
        "value": "seasoning"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "seasoning",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "seasoning",
            "expected": "seasoning",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 399,
            "output_tokens": 5,
            "latency_ms": 953.2,
            "usd": 0.001272,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "seasoning",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "seasoning",
            "expected": "seasoning",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 399,
            "output_tokens": 5,
            "latency_ms": 777.4,
            "usd": 0.001272,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "seasoning",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "seasoning",
            "expected": "seasoning",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 399,
            "output_tokens": 5,
            "latency_ms": 708.4,
            "usd": 0.001272,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 399.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 813.0,
        "usd_sum": 0.003816
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-014",
      "category": "vocabulary",
      "capability": "qualifier_vocabulary",
      "difficulty": "medium",
      "cuisine_region": "chinese",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.qualifiers"
      ],
      "prompt_template_hash": "3980456a47e9b7c6",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic qualifiers\n\nfresh, dried, cooked, candied, fermented, zest, overripe\n\n---\n\nSelect the OGS aromatic qualifier that best describes the state of Sichuan peppercorn as it is commonly used — whole, toasted, and ground over finished mapo tofu.\n\nINPUT:\n{\n  \"item\": \"Sichuan peppercorn (Zanthoxylum), whole, pan-toasted just before grinding over the dish\"\n}\n\nANSWER FORMAT: one_of:fresh,dried,cooked,candied,fermented,zest,overripe",
      "expected": {
        "value": "dried"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "dried",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "dried",
            "expected": "dried",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 392,
            "output_tokens": 4,
            "latency_ms": 743.3,
            "usd": 0.001236,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "dried",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "dried",
            "expected": "dried",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 392,
            "output_tokens": 4,
            "latency_ms": 832.3,
            "usd": 0.001236,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "dried",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "dried",
            "expected": "dried",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 392,
            "output_tokens": 4,
            "latency_ms": 709.4,
            "usd": 0.001236,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 392.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 761.667,
        "usd_sum": 0.003708
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-015",
      "category": "vocabulary",
      "capability": "grade_scale",
      "difficulty": "easy",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.grade_scale"
      ],
      "prompt_template_hash": "ad313a98974d64f7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Grade scale (derived from 0-100 overall_score)\n\n- **A+**: 95-100\n- **A**: 90-94\n- **B+**: 85-89\n- **B**: 80-84\n- **C+**: 75-79\n- **C**: 70-74\n- **D**: 60-69\n- **F**: <60\n\n---\n\nUsing the OGS pairing grade scale, return the letter grade for a pairing whose `overall_score` is 78.\n\nINPUT:\n{\n  \"overall_score\": 78\n}\n\nANSWER FORMAT: one_of:A+,A,B+,B,C+,C,D,F",
      "expected": {
        "value": "C+"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "C+",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "c+",
            "expected": "c+",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 414,
            "output_tokens": 5,
            "latency_ms": 1301.8,
            "usd": 0.001317,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "C+",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "c+",
            "expected": "c+",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 414,
            "output_tokens": 5,
            "latency_ms": 983.7,
            "usd": 0.001317,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "C+",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "c+",
            "expected": "c+",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 414,
            "output_tokens": 5,
            "latency_ms": 1402.9,
            "usd": 0.001317,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 414.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 1229.467,
        "usd_sum": 0.003951
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-016",
      "category": "vocabulary",
      "capability": "match_type_vocabulary",
      "difficulty": "medium",
      "cuisine_region": "british",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.match_types"
      ],
      "prompt_template_hash": "95e41ab82cba029a",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Match types\n\nharmony, contrast, bridge, intensity_alignment, structural_balance, refresh\n\n---\n\nSelect the single OGS match type that best describes a beverage whose carbonation physically scrubs the palate between bites of a rich fried dish.\n\nINPUT:\n{\n  \"description\": \"Champagne cleanses the palate between bites of fish and chips\"\n}\n\nANSWER FORMAT: one_of:harmony,contrast,bridge,intensity_alignment,structural_balance,refresh",
      "expected": {
        "value": "refresh"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "refresh",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "refresh",
            "expected": "refresh",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 360,
            "output_tokens": 4,
            "latency_ms": 989.3,
            "usd": 0.00114,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "refresh",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "refresh",
            "expected": "refresh",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 360,
            "output_tokens": 4,
            "latency_ms": 1365.6,
            "usd": 0.00114,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "refresh",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "refresh",
            "expected": "refresh",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 360,
            "output_tokens": 4,
            "latency_ms": 1248.3,
            "usd": 0.00114,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 360.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1201.067,
        "usd_sum": 0.00342
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-017",
      "category": "vocabulary",
      "capability": "aromatic_family_classification",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.aromatic_families"
      ],
      "prompt_template_hash": "8070fb687fd2f108",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Aromatic families\n\nfruit, floral, herbal, spice, earth, wood, dairy, savory, confection, vegetal, marine\n\n---\n\nUsing the OGS sensory model, return the single dominant aromatic family of fresh cilantro leaves.\n\nINPUT:\n{\n  \"item\": \"fresh cilantro / coriander leaves\"\n}\n\nANSWER FORMAT: one_of:fruit,floral,herbal,spice,earth,wood,dairy,savory,confection,vegetal,marine",
      "expected": {
        "value": "herbal"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "herbal",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "herbal",
            "expected": "herbal",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 370,
            "output_tokens": 5,
            "latency_ms": 650.6,
            "usd": 0.001185,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "herbal",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "herbal",
            "expected": "herbal",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 370,
            "output_tokens": 5,
            "latency_ms": 833.6,
            "usd": 0.001185,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "herbal",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "herbal",
            "expected": "herbal",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 370,
            "output_tokens": 5,
            "latency_ms": 701.9,
            "usd": 0.001185,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 370.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 728.7,
        "usd_sum": 0.003555
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-018",
      "category": "vocabulary",
      "capability": "component_role_vocabulary",
      "difficulty": "medium",
      "cuisine_region": "levantine",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.component_roles"
      ],
      "prompt_template_hash": "9e97b59361dbdf0e",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Component roles\n\nmain_protein, main_vegetable, starch, sauce, fat_component, acid_component, aromatic_garnish, seasoning, textural_element\n\n---\n\nReturn the single OGS component role that best fits a drizzle of pomegranate molasses dressed onto a fattoush salad.\n\nINPUT:\n{\n  \"description\": \"pomegranate molasses drizzled over a fattoush salad at service\"\n}\n\nANSWER FORMAT: one_of:main_protein,main_vegetable,starch,sauce,fat_component,acid_component,aromatic_garnish,seasoning,textural_element",
      "expected": {
        "value": "acid_component"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "acid_component",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "acid_component",
            "expected": "acid_component",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 401,
            "output_tokens": 6,
            "latency_ms": 1598.3,
            "usd": 0.001293,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "acid_component",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "acid_component",
            "expected": "acid_component",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 401,
            "output_tokens": 6,
            "latency_ms": 7669.8,
            "usd": 0.001293,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "acid_component",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "acid_component",
            "expected": "acid_component",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 401,
            "output_tokens": 6,
            "latency_ms": 2048.7,
            "usd": 0.001293,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 401.0,
        "output_tokens_mean": 6.0,
        "latency_ms_mean": 3772.267,
        "usd_sum": 0.003879
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-019",
      "category": "vocabulary",
      "capability": "cooking_method_vocabulary",
      "difficulty": "medium",
      "cuisine_region": "filipino",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.cooking_methods"
      ],
      "prompt_template_hash": "fc9a763682f1225b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Cooking methods\n\nraw, seared, grilled, roasted, braised, poached, steamed, fried, confit, smoked, fermented, cured, pickled, dehydrated, sous_vide, blanched, sauteed, baked, charred, emulsified\n\n---\n\nPick the single OGS cooking method that best describes bagoong alamang (Philippine fermented salt-cured shrimp paste).\n\nINPUT:\n{\n  \"description\": \"bagoong alamang: salt-cured tiny shrimp left to ferment for 3-6 months\"\n}\n\nANSWER FORMAT: one_of:raw,seared,grilled,roasted,braised,poached,steamed,fried,confit,smoked,fermented,cured,pickled,dehydrated,sous_vide,blanched,sauteed,baked,charred,emulsified",
      "expected": {
        "value": "fermented"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "fermented",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fermented",
            "expected": "fermented",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 463,
            "output_tokens": 5,
            "latency_ms": 883.1,
            "usd": 0.001464,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "fermented",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fermented",
            "expected": "fermented",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 463,
            "output_tokens": 5,
            "latency_ms": 1174.7,
            "usd": 0.001464,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "fermented",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "fermented",
            "expected": "fermented",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 463,
            "output_tokens": 5,
            "latency_ms": 880.7,
            "usd": 0.001464,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 463.0,
        "output_tokens_mean": 5.0,
        "latency_ms_mean": 979.5,
        "usd_sum": 0.004392
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:vocab-020",
      "category": "vocabulary",
      "capability": "grade_scale",
      "difficulty": "medium",
      "cuisine_region": "global",
      "scorer": "exact_match",
      "context_needs": [
        "vocab.grade_scale"
      ],
      "prompt_template_hash": "56ad4f9c58bc1eb7",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### Grade scale (derived from 0-100 overall_score)\n\n- **A+**: 95-100\n- **A**: 90-94\n- **B+**: 85-89\n- **B**: 80-84\n- **C+**: 75-79\n- **C**: 70-74\n- **D**: 60-69\n- **F**: <60\n\n---\n\nUsing the OGS pairing grade scale, return the letter grade for a pairing whose `overall_score` is 64.\n\nINPUT:\n{\n  \"overall_score\": 64\n}\n\nANSWER FORMAT: one_of:A+,A,B+,B,C+,C,D,F",
      "expected": {
        "value": "D"
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "D",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "d",
            "expected": "d",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 414,
            "output_tokens": 4,
            "latency_ms": 1068.9,
            "usd": 0.001302,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "D",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "d",
            "expected": "d",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 414,
            "output_tokens": 4,
            "latency_ms": 1371.5,
            "usd": 0.001302,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "D",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": "d",
            "expected": "d",
            "aliases": []
          },
          "metrics": {
            "input_tokens": 414,
            "output_tokens": 4,
            "latency_ms": 1221.7,
            "usd": 0.001302,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 1.0,
      "score_summary": {
        "mean": 1.0,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 1.0,
        "ci95_hi": 1.0,
        "n": 3
      },
      "score_if_valid": 1.0,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 414.0,
        "output_tokens_mean": 4.0,
        "latency_ms_mean": 1220.7,
        "usd_sum": 0.003906
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-001",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "a75ad0ee2a385794",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nFrom the OGS explanation-code vocabulary, return the set of codes that best explain WHY the pairing below is excellent. Respond as a JSON array of code IDs. Pick 3 to 6 most relevant positive codes.\n\nINPUT:\n{\n  \"dish\": \"A5 wagyu with black truffle jus (very rich, high umami, earthy aromatics, full body)\",\n  \"beverage\": \"Barolo 2018 (Nebbiolo, high tannin, high acidity, full body, tar/forest-floor)\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_TANNIN_FAT",
          "STRUCT_INTENSITY_MATCH",
          "AROMA_SHARED_EARTH",
          "STRUCT_BODY_ALIGN",
          "HARMONY_FLAVOR_CONVERGE"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_TANNIN_FAT\",\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_UMAMI_ENHANCE\",\"AROMA_SHARED_EARTH\",\"STRUCT_BODY_ALIGN\",\"STRUCT_ACIDITY_BALANCE\"]",
          "compliant": true,
          "score": 0.5455,
          "detail": {
            "predicted": [
              "AROMA_SHARED_EARTH",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "expected": [
              "AROMA_SHARED_EARTH",
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_TANNIN_FAT"
            ],
            "precision": 0.5,
            "recall": 0.6
          },
          "metrics": {
            "input_tokens": 1442,
            "output_tokens": 65,
            "latency_ms": 1933.3,
            "usd": 0.005301,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"STRUCT_TANNIN_FAT\",\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_UMAMI_ENHANCE\",\"AROMA_SHARED_EARTH\",\"STRUCT_BODY_ALIGN\",\"STRUCT_ACIDITY_BALANCE\"]",
          "compliant": true,
          "score": 0.5455,
          "detail": {
            "predicted": [
              "AROMA_SHARED_EARTH",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "expected": [
              "AROMA_SHARED_EARTH",
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_TANNIN_FAT"
            ],
            "precision": 0.5,
            "recall": 0.6
          },
          "metrics": {
            "input_tokens": 1442,
            "output_tokens": 65,
            "latency_ms": 2064.6,
            "usd": 0.005301,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_TANNIN_FAT\",\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_UMAMI_ENHANCE\",\"AROMA_SHARED_EARTH\",\"STRUCT_BODY_ALIGN\",\"STRUCT_ACIDITY_BALANCE\"]",
          "compliant": true,
          "score": 0.5455,
          "detail": {
            "predicted": [
              "AROMA_SHARED_EARTH",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "expected": [
              "AROMA_SHARED_EARTH",
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_TANNIN_FAT"
            ],
            "precision": 0.5,
            "recall": 0.6
          },
          "metrics": {
            "input_tokens": 1442,
            "output_tokens": 65,
            "latency_ms": 2995.9,
            "usd": 0.005301,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.5455,
      "score_summary": {
        "mean": 0.5455,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.5455,
        "ci95_hi": 0.5455,
        "n": 3
      },
      "score_if_valid": 0.5455,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1442.0,
        "output_tokens_mean": 65.0,
        "latency_ms_mean": 2331.267,
        "usd_sum": 0.015903
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-002",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "bdc5d8d74c85e9ab",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS explanation codes that best explain the success of the pairing below. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"fresh oysters on the half shell with lemon\",\n  \"beverage\": \"Chablis Premier Cru (Chardonnay, high acid, mineral, lean body)\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_ACIDITY_BALANCE",
          "REFRESH_ACID",
          "REFRESH_MINERAL",
          "CONTRAST_RICH_CRISP",
          "HARMONY_REGIONAL"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_ACIDITY_BALANCE\",\"REFRESH_ACID\",\"REFRESH_MINERAL\",\"AROMA_SHARED_EARTH\",\"HARMONY_TEXTURE_MATCH\"]",
          "compliant": true,
          "score": 0.6,
          "detail": {
            "predicted": [
              "AROMA_SHARED_EARTH",
              "HARMONY_TEXTURE_MATCH",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "expected": [
              "CONTRAST_RICH_CRISP",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "precision": 0.6,
            "recall": 0.6
          },
          "metrics": {
            "input_tokens": 1404,
            "output_tokens": 45,
            "latency_ms": 1576.3,
            "usd": 0.004887,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"STRUCT_ACIDITY_BALANCE\", \"REFRESH_ACID\", \"REFRESH_MINERAL\", \"AROMA_SHARED_EARTH\", \"HARMONY_TEXTURE_MATCH\"]",
          "compliant": true,
          "score": 0.6,
          "detail": {
            "predicted": [
              "AROMA_SHARED_EARTH",
              "HARMONY_TEXTURE_MATCH",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "expected": [
              "CONTRAST_RICH_CRISP",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "precision": 0.6,
            "recall": 0.6
          },
          "metrics": {
            "input_tokens": 1404,
            "output_tokens": 49,
            "latency_ms": 1288.9,
            "usd": 0.004947,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_ACIDITY_BALANCE\", \"REFRESH_MINERAL\", \"REFRESH_ACID\", \"HARMONY_REGIONAL\", \"STRUCT_BODY_ALIGN\"]",
          "compliant": true,
          "score": 0.8,
          "detail": {
            "predicted": [
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_BODY_ALIGN"
            ],
            "expected": [
              "CONTRAST_RICH_CRISP",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "precision": 0.8,
            "recall": 0.8
          },
          "metrics": {
            "input_tokens": 1404,
            "output_tokens": 45,
            "latency_ms": 1932.3,
            "usd": 0.004887,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.1155,
        "sem": 0.0667,
        "ci95_lo": 0.536,
        "ci95_hi": 0.7973,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1404.0,
        "output_tokens_mean": 46.333,
        "latency_ms_mean": 1599.167,
        "usd_sum": 0.014721
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-003",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "3440bcf3978a1c58",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (2 to 4 codes).\n\nINPUT:\n{\n  \"dish\": \"crispy buttermilk fried chicken\",\n  \"beverage\": \"non-vintage Brut Champagne\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "REFRESH_EFFERVESCE",
          "REFRESH_ACID",
          "CONTRAST_RICH_CRISP"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"REFRESH_EFFERVESCE\",\"STRUCT_ACIDITY_BALANCE\",\"CONTRAST_RICH_CRISP\",\"STRUCT_BODY_ALIGN\"]",
          "compliant": true,
          "score": 0.5714,
          "detail": {
            "predicted": [
              "CONTRAST_RICH_CRISP",
              "REFRESH_EFFERVESCE",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_BODY_ALIGN"
            ],
            "expected": [
              "CONTRAST_RICH_CRISP",
              "REFRESH_ACID",
              "REFRESH_EFFERVESCE"
            ],
            "precision": 0.5,
            "recall": 0.6667
          },
          "metrics": {
            "input_tokens": 1389,
            "output_tokens": 42,
            "latency_ms": 1545.7,
            "usd": 0.004797,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"REFRESH_EFFERVESCE\", \"CONTRAST_RICH_CRISP\", \"STRUCT_ACIDITY_BALANCE\", \"REFRESH_ACID\"]",
          "compliant": true,
          "score": 0.8571,
          "detail": {
            "predicted": [
              "CONTRAST_RICH_CRISP",
              "REFRESH_ACID",
              "REFRESH_EFFERVESCE",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "expected": [
              "CONTRAST_RICH_CRISP",
              "REFRESH_ACID",
              "REFRESH_EFFERVESCE"
            ],
            "precision": 0.75,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 1389,
            "output_tokens": 42,
            "latency_ms": 1170.5,
            "usd": 0.004797,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"REFRESH_EFFERVESCE\", \"CONTRAST_RICH_CRISP\", \"STRUCT_ACIDITY_BALANCE\", \"REFRESH_ACID\"]",
          "compliant": true,
          "score": 0.8571,
          "detail": {
            "predicted": [
              "CONTRAST_RICH_CRISP",
              "REFRESH_ACID",
              "REFRESH_EFFERVESCE",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "expected": [
              "CONTRAST_RICH_CRISP",
              "REFRESH_ACID",
              "REFRESH_EFFERVESCE"
            ],
            "precision": 0.75,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 1389,
            "output_tokens": 42,
            "latency_ms": 1380.5,
            "usd": 0.004797,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.7619,
      "score_summary": {
        "mean": 0.7619,
        "stdev": 0.1649,
        "sem": 0.0952,
        "ci95_lo": 0.5752,
        "ci95_hi": 0.9485,
        "n": 3
      },
      "score_if_valid": 0.7619,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1389.0,
        "output_tokens_mean": 42.0,
        "latency_ms_mean": 1365.567,
        "usd_sum": 0.014391
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-004",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "hard",
      "cuisine_region": "french",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "b3249dafda966f70",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS explanation codes that best explain the success of the pairing below. Respond as a JSON array (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"seared foie gras on brioche (very rich, fatty, slightly sweet-savoury)\",\n  \"beverage\": \"Sauternes (botrytised sweet white, high residual sugar, high acidity)\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_SWEET_OFFSET",
          "STRUCT_ACIDITY_BALANCE",
          "CONTRAST_RICH_CRISP",
          "HARMONY_REGIONAL"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_ACIDITY_BALANCE\", \"CONTRAST_RICH_CRISP\", \"STRUCT_SWEET_OFFSET\", \"HARMONY_REGIONAL\", \"STRUCT_TANNIN_FAT\"]",
          "compliant": true,
          "score": 0.8889,
          "detail": {
            "predicted": [
              "CONTRAST_RICH_CRISP",
              "HARMONY_REGIONAL",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "CONTRAST_RICH_CRISP",
              "HARMONY_REGIONAL",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.8,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 1409,
            "output_tokens": 54,
            "latency_ms": 1550.6,
            "usd": 0.005037,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"STRUCT_ACIDITY_BALANCE\", \"CONTRAST_RICH_CRISP\", \"STRUCT_SWEET_OFFSET\", \"HARMONY_REGIONAL\", \"STRUCT_BODY_ALIGN\"]",
          "compliant": true,
          "score": 0.8889,
          "detail": {
            "predicted": [
              "CONTRAST_RICH_CRISP",
              "HARMONY_REGIONAL",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_SWEET_OFFSET"
            ],
            "expected": [
              "CONTRAST_RICH_CRISP",
              "HARMONY_REGIONAL",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.8,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 1409,
            "output_tokens": 52,
            "latency_ms": 1332.5,
            "usd": 0.005007,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_ACIDITY_BALANCE\", \"CONTRAST_RICH_CRISP\", \"STRUCT_SWEET_OFFSET\", \"HARMONY_REGIONAL\", \"STRUCT_TANNIN_FAT\"]",
          "compliant": true,
          "score": 0.8889,
          "detail": {
            "predicted": [
              "CONTRAST_RICH_CRISP",
              "HARMONY_REGIONAL",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "CONTRAST_RICH_CRISP",
              "HARMONY_REGIONAL",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.8,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 1409,
            "output_tokens": 54,
            "latency_ms": 1336.2,
            "usd": 0.005037,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.8889,
      "score_summary": {
        "mean": 0.8889,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.8889,
        "ci95_hi": 0.8889,
        "n": 3
      },
      "score_if_valid": 0.8889,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1409.0,
        "output_tokens_mean": 53.333,
        "latency_ms_mean": 1406.433,
        "usd_sum": 0.015081
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-005",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "0cb2f1946943a0e2",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS explanation codes that best explain why the pairing works. Respond as a JSON array (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"charcoal-grilled lamb rack with rosemary, black pepper crust, charred edges\",\n  \"beverage\": \"Northern Rhone Syrah (Cote-Rotie) — black pepper, violet, smoked meat, moderate-high tannin\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "AROMA_SHARED_SPICE",
          "AROMA_SHARED_WOOD",
          "AROMA_SHARED_HERBAL",
          "STRUCT_TANNIN_FAT",
          "STRUCT_TANNIN_PROTEIN"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"AROMA_SHARED_SPICE\",\"AROMA_SHARED_WOOD\",\"STRUCT_TANNIN_PROTEIN\",\"HARMONY_REGIONAL\",\"STRUCT_INTENSITY_MATCH\"]",
          "compliant": true,
          "score": 0.6,
          "detail": {
            "predicted": [
              "AROMA_SHARED_SPICE",
              "AROMA_SHARED_WOOD",
              "HARMONY_REGIONAL",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "AROMA_SHARED_SPICE",
              "AROMA_SHARED_WOOD",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.6,
            "recall": 0.6
          },
          "metrics": {
            "input_tokens": 1419,
            "output_tokens": 51,
            "latency_ms": 1763.7,
            "usd": 0.005022,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"AROMA_SHARED_SPICE\",\"AROMA_SHARED_WOOD\",\"STRUCT_TANNIN_PROTEIN\",\"HARMONY_REGIONAL\",\"STRUCT_INTENSITY_MATCH\"]",
          "compliant": true,
          "score": 0.6,
          "detail": {
            "predicted": [
              "AROMA_SHARED_SPICE",
              "AROMA_SHARED_WOOD",
              "HARMONY_REGIONAL",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "AROMA_SHARED_SPICE",
              "AROMA_SHARED_WOOD",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.6,
            "recall": 0.6
          },
          "metrics": {
            "input_tokens": 1419,
            "output_tokens": 51,
            "latency_ms": 1402.3,
            "usd": 0.005022,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"AROMA_SHARED_SPICE\",\"AROMA_SHARED_WOOD\",\"STRUCT_TANNIN_PROTEIN\",\"HARMONY_REGIONAL\",\"STRUCT_INTENSITY_MATCH\"]",
          "compliant": true,
          "score": 0.6,
          "detail": {
            "predicted": [
              "AROMA_SHARED_SPICE",
              "AROMA_SHARED_WOOD",
              "HARMONY_REGIONAL",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "AROMA_SHARED_SPICE",
              "AROMA_SHARED_WOOD",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.6,
            "recall": 0.6
          },
          "metrics": {
            "input_tokens": 1419,
            "output_tokens": 51,
            "latency_ms": 1685.7,
            "usd": 0.005022,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.6,
      "score_summary": {
        "mean": 0.6,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.6,
        "ci95_hi": 0.6,
        "n": 3
      },
      "score_if_valid": 0.6,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1419.0,
        "output_tokens_mean": 51.0,
        "latency_ms_mean": 1617.233,
        "usd_sum": 0.015066
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-006",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "9a46b0d141442ce3",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"Pizza Margherita (tomato, mozzarella, basil)\",\n  \"beverage\": \"Chianti Classico (Sangiovese, high acid, medium tannin)\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_ACIDITY_BALANCE",
          "HARMONY_REGIONAL",
          "STRUCT_INTENSITY_MATCH",
          "AROMA_SHARED_HERBAL"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"HARMONY_REGIONAL\", \"STRUCT_ACIDITY_BALANCE\", \"STRUCT_TANNIN_PROTEIN\", \"AROMA_SHARED_HERBAL\", \"REFRESH_ACID\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_INTENSITY_MATCH"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1407,
            "output_tokens": 52,
            "latency_ms": 1251.7,
            "usd": 0.005001,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"HARMONY_REGIONAL\", \"STRUCT_ACIDITY_BALANCE\", \"REFRESH_ACID\", \"STRUCT_TANNIN_PROTEIN\", \"AROMA_SHARED_HERBAL\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_INTENSITY_MATCH"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1407,
            "output_tokens": 52,
            "latency_ms": 1422.2,
            "usd": 0.005001,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"HARMONY_REGIONAL\", \"STRUCT_ACIDITY_BALANCE\", \"STRUCT_TANNIN_PROTEIN\", \"AROMA_SHARED_HERBAL\", \"REFRESH_ACID\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_INTENSITY_MATCH"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1407,
            "output_tokens": 52,
            "latency_ms": 1196.4,
            "usd": 0.005001,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.6667,
        "ci95_hi": 0.6667,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1407.0,
        "output_tokens_mean": 52.0,
        "latency_ms_mean": 1290.1,
        "usd_sum": 0.015003
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-007",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "hard",
      "cuisine_region": "chinese",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "497adc64f3303eb4",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"mapo tofu with Sichuan peppercorn\",\n  \"beverage\": \"off-dry German Spätlese Riesling\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_SWEET_OFFSET",
          "REFRESH_ACID",
          "STRUCT_INTENSITY_MATCH",
          "CONTRAST_SWEET_SALTY"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_SWEET_OFFSET\", \"REFRESH_ACID\", \"CONTRAST_SWEET_SALTY\", \"STRUCT_INTENSITY_MATCH\", \"AROMA_SHARED_SPICE\"]",
          "compliant": true,
          "score": 0.8889,
          "detail": {
            "predicted": [
              "AROMA_SHARED_SPICE",
              "CONTRAST_SWEET_SALTY",
              "REFRESH_ACID",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "expected": [
              "CONTRAST_SWEET_SALTY",
              "REFRESH_ACID",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.8,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 1398,
            "output_tokens": 53,
            "latency_ms": 1454.7,
            "usd": 0.004989,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"STRUCT_SWEET_OFFSET\",\"REFRESH_ACID\",\"CONTRAST_SWEET_SALTY\",\"STRUCT_INTENSITY_MATCH\",\"AROMA_SHARED_SPICE\"]",
          "compliant": true,
          "score": 0.8889,
          "detail": {
            "predicted": [
              "AROMA_SHARED_SPICE",
              "CONTRAST_SWEET_SALTY",
              "REFRESH_ACID",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "expected": [
              "CONTRAST_SWEET_SALTY",
              "REFRESH_ACID",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.8,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 1398,
            "output_tokens": 49,
            "latency_ms": 1348.6,
            "usd": 0.004929,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_SWEET_OFFSET\", \"REFRESH_ACID\", \"CONTRAST_SWEET_SALTY\", \"STRUCT_INTENSITY_MATCH\", \"AROMA_SHARED_SPICE\"]",
          "compliant": true,
          "score": 0.8889,
          "detail": {
            "predicted": [
              "AROMA_SHARED_SPICE",
              "CONTRAST_SWEET_SALTY",
              "REFRESH_ACID",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "expected": [
              "CONTRAST_SWEET_SALTY",
              "REFRESH_ACID",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.8,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 1398,
            "output_tokens": 53,
            "latency_ms": 1334.2,
            "usd": 0.004989,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.8889,
      "score_summary": {
        "mean": 0.8889,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.8889,
        "ci95_hi": 0.8889,
        "n": 3
      },
      "score_if_valid": 0.8889,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1398.0,
        "output_tokens_mean": 51.667,
        "latency_ms_mean": 1379.167,
        "usd_sum": 0.014907
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-008",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "2120f96a0444a50e",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"herb-crusted rack of lamb, medium-rare\",\n  \"beverage\": \"Northern Rhône Syrah\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_TANNIN_PROTEIN",
          "AROMA_SHARED_HERBAL",
          "STRUCT_BODY_ALIGN",
          "HARMONY_FLAVOR_CONVERGE"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"AROMA_SHARED_HERBAL\",\"AROMA_SHARED_EARTH\",\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_TANNIN_FAT\",\"HARMONY_REGIONAL\"]",
          "compliant": true,
          "score": 0.4444,
          "detail": {
            "predicted": [
              "AROMA_SHARED_EARTH",
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.4,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1390,
            "output_tokens": 53,
            "latency_ms": 1371.9,
            "usd": 0.004965,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"AROMA_SHARED_HERBAL\",\"AROMA_SHARED_EARTH\",\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_TANNIN_FAT\",\"HARMONY_REGIONAL\"]",
          "compliant": true,
          "score": 0.4444,
          "detail": {
            "predicted": [
              "AROMA_SHARED_EARTH",
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.4,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1390,
            "output_tokens": 53,
            "latency_ms": 1838.0,
            "usd": 0.004965,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"AROMA_SHARED_HERBAL\",\"AROMA_SHARED_EARTH\",\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_BODY_ALIGN\",\"STRUCT_INTENSITY_MATCH\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_EARTH",
              "AROMA_SHARED_HERBAL",
              "STRUCT_BODY_ALIGN",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1390,
            "output_tokens": 54,
            "latency_ms": 1381.6,
            "usd": 0.00498,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.5185,
      "score_summary": {
        "mean": 0.5185,
        "stdev": 0.1283,
        "sem": 0.0741,
        "ci95_lo": 0.3733,
        "ci95_hi": 0.6637,
        "n": 3
      },
      "score_if_valid": 0.5185,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1390.0,
        "output_tokens_mean": 53.333,
        "latency_ms_mean": 1530.5,
        "usd_sum": 0.01491
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-009",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "hard",
      "cuisine_region": "thai",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "c077c498ceed5f73",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"Thai green curry with chicken and Thai basil\",\n  \"beverage\": \"off-dry New Zealand Sauvignon Blanc\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_SWEET_OFFSET",
          "AROMA_SHARED_HERBAL",
          "REFRESH_ACID",
          "STRUCT_INTENSITY_MATCH"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_SWEET_OFFSET\", \"REFRESH_ACID\", \"AROMA_SHARED_HERBAL\", \"CONTRAST_RICH_CRISP\", \"STRUCT_ACIDITY_BALANCE\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_HERBAL",
              "CONTRAST_RICH_CRISP",
              "REFRESH_ACID",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "REFRESH_ACID",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1392,
            "output_tokens": 54,
            "latency_ms": 1690.3,
            "usd": 0.004986,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"STRUCT_SWEET_OFFSET\", \"REFRESH_ACID\", \"AROMA_SHARED_HERBAL\", \"CONTRAST_RICH_CRISP\", \"STRUCT_ACIDITY_BALANCE\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_HERBAL",
              "CONTRAST_RICH_CRISP",
              "REFRESH_ACID",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "REFRESH_ACID",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1392,
            "output_tokens": 54,
            "latency_ms": 1631.2,
            "usd": 0.004986,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_SWEET_OFFSET\",\"REFRESH_ACID\",\"AROMA_SHARED_HERBAL\",\"CONTRAST_RICH_CRISP\",\"STRUCT_ACIDITY_BALANCE\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_HERBAL",
              "CONTRAST_RICH_CRISP",
              "REFRESH_ACID",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "REFRESH_ACID",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1392,
            "output_tokens": 50,
            "latency_ms": 1642.7,
            "usd": 0.004926,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.6667,
        "ci95_hi": 0.6667,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1392.0,
        "output_tokens_mean": 52.667,
        "latency_ms_mean": 1654.733,
        "usd_sum": 0.014898
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-010",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "easy",
      "cuisine_region": "french",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "956c6ce654c2e297",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"moules marinières\",\n  \"beverage\": \"Muscadet sur lie\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_ACIDITY_BALANCE",
          "REFRESH_ACID",
          "AROMA_SHARED_HERBAL",
          "HARMONY_REGIONAL",
          "REFRESH_MINERAL"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"HARMONY_REGIONAL\", \"STRUCT_ACIDITY_BALANCE\", \"REFRESH_ACID\", \"REFRESH_MINERAL\", \"HARMONY_TEXTURE_MATCH\"]",
          "compliant": true,
          "score": 0.8,
          "detail": {
            "predicted": [
              "HARMONY_REGIONAL",
              "HARMONY_TEXTURE_MATCH",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "precision": 0.8,
            "recall": 0.8
          },
          "metrics": {
            "input_tokens": 1382,
            "output_tokens": 45,
            "latency_ms": 1229.5,
            "usd": 0.004821,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"HARMONY_REGIONAL\",\"STRUCT_ACIDITY_BALANCE\",\"REFRESH_MINERAL\",\"REFRESH_ACID\",\"AROMA_SHARED_HERBAL\"]",
          "compliant": true,
          "score": 1.0,
          "detail": {
            "predicted": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "precision": 1.0,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 1382,
            "output_tokens": 43,
            "latency_ms": 3462.2,
            "usd": 0.004791,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"HARMONY_REGIONAL\",\"STRUCT_ACIDITY_BALANCE\",\"REFRESH_MINERAL\",\"REFRESH_ACID\",\"HARMONY_TEXTURE_MATCH\"]",
          "compliant": true,
          "score": 0.8,
          "detail": {
            "predicted": [
              "HARMONY_REGIONAL",
              "HARMONY_TEXTURE_MATCH",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "precision": 0.8,
            "recall": 0.8
          },
          "metrics": {
            "input_tokens": 1382,
            "output_tokens": 41,
            "latency_ms": 1498.4,
            "usd": 0.004761,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.8667,
      "score_summary": {
        "mean": 0.8667,
        "stdev": 0.1155,
        "sem": 0.0667,
        "ci95_lo": 0.736,
        "ci95_hi": 0.9973,
        "n": 3
      },
      "score_if_valid": 0.8667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1382.0,
        "output_tokens_mean": 43.0,
        "latency_ms_mean": 2063.367,
        "usd_sum": 0.014373
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-011",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "hard",
      "cuisine_region": "mexican",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "1dfeb782059a4814",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"tacos al pastor with pineapple and chili\",\n  \"beverage\": \"dry sparkling rosé (Cava rosado)\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_SWEET_OFFSET",
          "REFRESH_EFFERVESCE",
          "AROMA_SHARED_FRUIT",
          "CONTRAST_RICH_CRISP"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"REFRESH_EFFERVESCE\", \"STRUCT_ACIDITY_BALANCE\", \"CONTRAST_SWEET_ACID\", \"CONTRAST_RICH_CRISP\", \"AROMA_SHARED_FRUIT\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_FRUIT",
              "CONTRAST_RICH_CRISP",
              "CONTRAST_SWEET_ACID",
              "REFRESH_EFFERVESCE",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "expected": [
              "AROMA_SHARED_FRUIT",
              "CONTRAST_RICH_CRISP",
              "REFRESH_EFFERVESCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1395,
            "output_tokens": 56,
            "latency_ms": 1370.3,
            "usd": 0.005025,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"REFRESH_EFFERVESCE\", \"STRUCT_ACIDITY_BALANCE\", \"CONTRAST_SWEET_ACID\", \"AROMA_SHARED_FRUIT\", \"CONTRAST_RICH_CRISP\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_FRUIT",
              "CONTRAST_RICH_CRISP",
              "CONTRAST_SWEET_ACID",
              "REFRESH_EFFERVESCE",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "expected": [
              "AROMA_SHARED_FRUIT",
              "CONTRAST_RICH_CRISP",
              "REFRESH_EFFERVESCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1395,
            "output_tokens": 56,
            "latency_ms": 1365.0,
            "usd": 0.005025,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"REFRESH_EFFERVESCE\", \"STRUCT_ACIDITY_BALANCE\", \"CONTRAST_SWEET_ACID\", \"CONTRAST_RICH_CRISP\", \"AROMA_SHARED_FRUIT\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_FRUIT",
              "CONTRAST_RICH_CRISP",
              "CONTRAST_SWEET_ACID",
              "REFRESH_EFFERVESCE",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "expected": [
              "AROMA_SHARED_FRUIT",
              "CONTRAST_RICH_CRISP",
              "REFRESH_EFFERVESCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1395,
            "output_tokens": 56,
            "latency_ms": 1666.6,
            "usd": 0.005025,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.6667,
        "ci95_hi": 0.6667,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1395.0,
        "output_tokens_mean": 56.0,
        "latency_ms_mean": 1467.3,
        "usd_sum": 0.015075
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-012",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "french",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "28df36c8ae155697",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"duck confit, crispy skin\",\n  \"beverage\": \"mature Burgundy Pinot Noir (10 years)\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_TANNIN_FAT",
          "AROMA_SHARED_EARTH",
          "HARMONY_REGIONAL",
          "HARMONY_COMPLEXITY_MATCH"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_TANNIN_FAT\",\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_ACIDITY_BALANCE\",\"AROMA_SHARED_EARTH\",\"HARMONY_COMPLEXITY_MATCH\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_EARTH",
              "HARMONY_COMPLEXITY_MATCH",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_EARTH",
              "HARMONY_COMPLEXITY_MATCH",
              "HARMONY_REGIONAL",
              "STRUCT_TANNIN_FAT"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1392,
            "output_tokens": 55,
            "latency_ms": 1626.7,
            "usd": 0.005001,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"STRUCT_TANNIN_FAT\",\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_ACIDITY_BALANCE\",\"AROMA_SHARED_EARTH\",\"HARMONY_COMPLEXITY_MATCH\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_EARTH",
              "HARMONY_COMPLEXITY_MATCH",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_EARTH",
              "HARMONY_COMPLEXITY_MATCH",
              "HARMONY_REGIONAL",
              "STRUCT_TANNIN_FAT"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1392,
            "output_tokens": 55,
            "latency_ms": 1409.5,
            "usd": 0.005001,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_TANNIN_FAT\",\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_ACIDITY_BALANCE\",\"AROMA_SHARED_EARTH\",\"HARMONY_COMPLEXITY_MATCH\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_EARTH",
              "HARMONY_COMPLEXITY_MATCH",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_EARTH",
              "HARMONY_COMPLEXITY_MATCH",
              "HARMONY_REGIONAL",
              "STRUCT_TANNIN_FAT"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1392,
            "output_tokens": 55,
            "latency_ms": 2074.9,
            "usd": 0.005001,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.6667,
        "ci95_hi": 0.6667,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1392.0,
        "output_tokens_mean": 55.0,
        "latency_ms_mean": 1703.7,
        "usd_sum": 0.015003
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-013",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "spanish",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "18a20e7e30844c41",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"jamón ibérico de bellota, sliced thin\",\n  \"beverage\": \"chilled Fino sherry (Manzanilla)\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_UMAMI_ENHANCE",
          "HARMONY_REGIONAL",
          "AROMA_SHARED_SAVORY",
          "REFRESH_MINERAL"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"HARMONY_REGIONAL\",\"STRUCT_TANNIN_PROTEIN\",\"REFRESH_ACID\",\"REFRESH_MINERAL\",\"HARMONY_FLAVOR_CONVERGE\"]",
          "compliant": true,
          "score": 0.4444,
          "detail": {
            "predicted": [
              "HARMONY_FLAVOR_CONVERGE",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_SAVORY",
              "HARMONY_REGIONAL",
              "REFRESH_MINERAL",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "precision": 0.4,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1396,
            "output_tokens": 43,
            "latency_ms": 1698.5,
            "usd": 0.004833,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"HARMONY_REGIONAL\",\"STRUCT_TANNIN_PROTEIN\",\"REFRESH_ACID\",\"REFRESH_MINERAL\",\"CONTRAST_RICH_CRISP\"]",
          "compliant": true,
          "score": 0.4444,
          "detail": {
            "predicted": [
              "CONTRAST_RICH_CRISP",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_SAVORY",
              "HARMONY_REGIONAL",
              "REFRESH_MINERAL",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "precision": 0.4,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1396,
            "output_tokens": 43,
            "latency_ms": 1546.4,
            "usd": 0.004833,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"HARMONY_REGIONAL\",\"STRUCT_TANNIN_PROTEIN\",\"REFRESH_ACID\",\"REFRESH_MINERAL\",\"HARMONY_FLAVOR_CONVERGE\"]",
          "compliant": true,
          "score": 0.4444,
          "detail": {
            "predicted": [
              "HARMONY_FLAVOR_CONVERGE",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_SAVORY",
              "HARMONY_REGIONAL",
              "REFRESH_MINERAL",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "precision": 0.4,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1396,
            "output_tokens": 43,
            "latency_ms": 1677.0,
            "usd": 0.004833,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.4444,
      "score_summary": {
        "mean": 0.4444,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.4444,
        "ci95_hi": 0.4444,
        "n": 3
      },
      "score_if_valid": 0.4444,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1396.0,
        "output_tokens_mean": 43.0,
        "latency_ms_mean": 1640.633,
        "usd_sum": 0.014499
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-014",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "korean",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "f9be4ef2853d67b3",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"bulgogi: soy-pear-garlic marinated grilled beef\",\n  \"beverage\": \"medium-bodied unoaked Merlot\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_TANNIN_PROTEIN",
          "STRUCT_SWEET_OFFSET",
          "AROMA_SHARED_FRUIT",
          "STRUCT_BODY_ALIGN"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_BODY_ALIGN\",\"STRUCT_ACIDITY_BALANCE\",\"AROMA_SHARED_FRUIT\",\"HARMONY_FLAVOR_CONVERGE\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_FRUIT",
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_FRUIT",
              "STRUCT_BODY_ALIGN",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1399,
            "output_tokens": 54,
            "latency_ms": 1439.6,
            "usd": 0.005007,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_TANNIN_FAT\",\"STRUCT_BODY_ALIGN\",\"AROMA_SHARED_FRUIT\",\"REFRESH_ACID\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_FRUIT",
              "REFRESH_ACID",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_FRUIT",
              "STRUCT_BODY_ALIGN",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1399,
            "output_tokens": 51,
            "latency_ms": 2201.2,
            "usd": 0.004962,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_BODY_ALIGN\",\"STRUCT_INTENSITY_MATCH\",\"AROMA_SHARED_FRUIT\",\"REFRESH_ACID\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_FRUIT",
              "REFRESH_ACID",
              "STRUCT_BODY_ALIGN",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_FRUIT",
              "STRUCT_BODY_ALIGN",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1399,
            "output_tokens": 49,
            "latency_ms": 1590.9,
            "usd": 0.004932,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.6667,
        "ci95_hi": 0.6667,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1399.0,
        "output_tokens_mean": 51.333,
        "latency_ms_mean": 1743.9,
        "usd_sum": 0.014901
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-015",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "easy",
      "cuisine_region": "british",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "5474f731e5d9751c",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"Stilton blue cheese\",\n  \"beverage\": \"Vintage Port\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "CONTRAST_SWEET_SALTY",
          "STRUCT_INTENSITY_MATCH",
          "HARMONY_COMPLEXITY_MATCH",
          "STRUCT_SWEET_OFFSET"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"CONTRAST_SWEET_SALTY\", \"STRUCT_SWEET_OFFSET\", \"STRUCT_TANNIN_FAT\", \"HARMONY_FLAVOR_CONVERGE\", \"STRUCT_INTENSITY_MATCH\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "CONTRAST_SWEET_SALTY",
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "CONTRAST_SWEET_SALTY",
              "HARMONY_COMPLEXITY_MATCH",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1380,
            "output_tokens": 57,
            "latency_ms": 1291.4,
            "usd": 0.004995,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"CONTRAST_SWEET_SALTY\",\"STRUCT_SWEET_OFFSET\",\"STRUCT_TANNIN_FAT\",\"HARMONY_FLAVOR_CONVERGE\",\"STRUCT_INTENSITY_MATCH\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "CONTRAST_SWEET_SALTY",
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "CONTRAST_SWEET_SALTY",
              "HARMONY_COMPLEXITY_MATCH",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1380,
            "output_tokens": 53,
            "latency_ms": 1460.2,
            "usd": 0.004935,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"CONTRAST_SWEET_SALTY\",\"STRUCT_SWEET_OFFSET\",\"HARMONY_FLAVOR_CONVERGE\",\"STRUCT_INTENSITY_MATCH\",\"HARMONY_COMPLEXITY_MATCH\"]",
          "compliant": true,
          "score": 0.8889,
          "detail": {
            "predicted": [
              "CONTRAST_SWEET_SALTY",
              "HARMONY_COMPLEXITY_MATCH",
              "HARMONY_FLAVOR_CONVERGE",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "expected": [
              "CONTRAST_SWEET_SALTY",
              "HARMONY_COMPLEXITY_MATCH",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.8,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 1380,
            "output_tokens": 51,
            "latency_ms": 1302.9,
            "usd": 0.004905,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.7408,
      "score_summary": {
        "mean": 0.7408,
        "stdev": 0.1283,
        "sem": 0.0741,
        "ci95_lo": 0.5956,
        "ci95_hi": 0.8859,
        "n": 3
      },
      "score_if_valid": 0.7408,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1380.0,
        "output_tokens_mean": 53.667,
        "latency_ms_mean": 1351.5,
        "usd_sum": 0.014835
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-016",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "hard",
      "cuisine_region": "japanese",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "fc7f227aebed6756",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"shrimp tempura, light fry\",\n  \"beverage\": \"chilled junmai ginjo sake\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_UMAMI_ENHANCE",
          "REFRESH_MINERAL",
          "AROMA_SHARED_FLORAL",
          "STRUCT_INTENSITY_MATCH",
          "HARMONY_REGIONAL"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_BODY_ALIGN\",\"REFRESH_ACID\",\"CONTRAST_RICH_CRISP\",\"REFRESH_MINERAL\",\"HARMONY_TEXTURE_MATCH\"]",
          "compliant": true,
          "score": 0.2,
          "detail": {
            "predicted": [
              "CONTRAST_RICH_CRISP",
              "HARMONY_TEXTURE_MATCH",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_BODY_ALIGN"
            ],
            "expected": [
              "AROMA_SHARED_FLORAL",
              "HARMONY_REGIONAL",
              "REFRESH_MINERAL",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "precision": 0.2,
            "recall": 0.2
          },
          "metrics": {
            "input_tokens": 1388,
            "output_tokens": 44,
            "latency_ms": 1281.8,
            "usd": 0.004824,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"STRUCT_BODY_ALIGN\",\"REFRESH_ACID\",\"CONTRAST_RICH_CRISP\",\"HARMONY_TEXTURE_MATCH\",\"REFRESH_MINERAL\"]",
          "compliant": true,
          "score": 0.2,
          "detail": {
            "predicted": [
              "CONTRAST_RICH_CRISP",
              "HARMONY_TEXTURE_MATCH",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_BODY_ALIGN"
            ],
            "expected": [
              "AROMA_SHARED_FLORAL",
              "HARMONY_REGIONAL",
              "REFRESH_MINERAL",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "precision": 0.2,
            "recall": 0.2
          },
          "metrics": {
            "input_tokens": 1388,
            "output_tokens": 44,
            "latency_ms": 1375.5,
            "usd": 0.004824,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_BODY_ALIGN\",\"REFRESH_ACID\",\"CONTRAST_RICH_CRISP\",\"AROMA_SHARED_FLORAL\",\"REFRESH_MINERAL\"]",
          "compliant": true,
          "score": 0.4,
          "detail": {
            "predicted": [
              "AROMA_SHARED_FLORAL",
              "CONTRAST_RICH_CRISP",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_BODY_ALIGN"
            ],
            "expected": [
              "AROMA_SHARED_FLORAL",
              "HARMONY_REGIONAL",
              "REFRESH_MINERAL",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_UMAMI_ENHANCE"
            ],
            "precision": 0.4,
            "recall": 0.4
          },
          "metrics": {
            "input_tokens": 1388,
            "output_tokens": 46,
            "latency_ms": 1817.5,
            "usd": 0.004854,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.2667,
      "score_summary": {
        "mean": 0.2667,
        "stdev": 0.1155,
        "sem": 0.0667,
        "ci95_lo": 0.136,
        "ci95_hi": 0.3973,
        "n": 3
      },
      "score_if_valid": 0.2667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1388.0,
        "output_tokens_mean": 44.667,
        "latency_ms_mean": 1491.6,
        "usd_sum": 0.014502
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-017",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "greek",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "024198497e327bb1",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"horiatiki salad with feta, tomato, olive, oregano\",\n  \"beverage\": \"Santorini Assyrtiko\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "REFRESH_ACID",
          "REFRESH_MINERAL",
          "HARMONY_REGIONAL",
          "AROMA_SHARED_HERBAL"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"HARMONY_REGIONAL\", \"STRUCT_ACIDITY_BALANCE\", \"REFRESH_MINERAL\", \"AROMA_SHARED_HERBAL\", \"REFRESH_ACID\"]",
          "compliant": true,
          "score": 0.8889,
          "detail": {
            "predicted": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL"
            ],
            "precision": 0.8,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 1396,
            "output_tokens": 47,
            "latency_ms": 1592.1,
            "usd": 0.004893,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"HARMONY_REGIONAL\", \"STRUCT_ACIDITY_BALANCE\", \"REFRESH_MINERAL\", \"AROMA_SHARED_HERBAL\", \"REFRESH_ACID\"]",
          "compliant": true,
          "score": 0.8889,
          "detail": {
            "predicted": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL"
            ],
            "precision": 0.8,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 1396,
            "output_tokens": 47,
            "latency_ms": 1501.0,
            "usd": 0.004893,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"HARMONY_REGIONAL\", \"STRUCT_ACIDITY_BALANCE\", \"REFRESH_MINERAL\", \"AROMA_SHARED_HERBAL\", \"REFRESH_ACID\"]",
          "compliant": true,
          "score": 0.8889,
          "detail": {
            "predicted": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL",
              "STRUCT_ACIDITY_BALANCE"
            ],
            "expected": [
              "AROMA_SHARED_HERBAL",
              "HARMONY_REGIONAL",
              "REFRESH_ACID",
              "REFRESH_MINERAL"
            ],
            "precision": 0.8,
            "recall": 1.0
          },
          "metrics": {
            "input_tokens": 1396,
            "output_tokens": 47,
            "latency_ms": 1437.7,
            "usd": 0.004893,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.8889,
      "score_summary": {
        "mean": 0.8889,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.8889,
        "ci95_hi": 0.8889,
        "n": 3
      },
      "score_if_valid": 0.8889,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1396.0,
        "output_tokens_mean": 47.0,
        "latency_ms_mean": 1510.267,
        "usd_sum": 0.014679
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-018",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "hard",
      "cuisine_region": "chinese",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "283bbb9e703e6e30",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"Peking duck with pancake, scallion, hoisin\",\n  \"beverage\": \"10-year-old off-dry Mosel Riesling\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_SWEET_OFFSET",
          "AROMA_SHARED_FRUIT",
          "REFRESH_ACID",
          "HARMONY_COMPLEXITY_MATCH"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"STRUCT_SWEET_OFFSET\", \"REFRESH_ACID\", \"CONTRAST_RICH_CRISP\", \"STRUCT_ACIDITY_BALANCE\", \"AROMA_SHARED_FRUIT\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_FRUIT",
              "CONTRAST_RICH_CRISP",
              "REFRESH_ACID",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "expected": [
              "AROMA_SHARED_FRUIT",
              "HARMONY_COMPLEXITY_MATCH",
              "REFRESH_ACID",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1402,
            "output_tokens": 53,
            "latency_ms": 4532.5,
            "usd": 0.005001,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"STRUCT_SWEET_OFFSET\", \"REFRESH_ACID\", \"CONTRAST_RICH_CRISP\", \"STRUCT_TANNIN_FAT\", \"AROMA_SHARED_FRUIT\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_FRUIT",
              "CONTRAST_RICH_CRISP",
              "REFRESH_ACID",
              "STRUCT_SWEET_OFFSET",
              "STRUCT_TANNIN_FAT"
            ],
            "expected": [
              "AROMA_SHARED_FRUIT",
              "HARMONY_COMPLEXITY_MATCH",
              "REFRESH_ACID",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1402,
            "output_tokens": 54,
            "latency_ms": 1550.8,
            "usd": 0.005016,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"STRUCT_SWEET_OFFSET\", \"REFRESH_ACID\", \"STRUCT_ACIDITY_BALANCE\", \"CONTRAST_RICH_CRISP\", \"AROMA_SHARED_FRUIT\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_FRUIT",
              "CONTRAST_RICH_CRISP",
              "REFRESH_ACID",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "expected": [
              "AROMA_SHARED_FRUIT",
              "HARMONY_COMPLEXITY_MATCH",
              "REFRESH_ACID",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1402,
            "output_tokens": 53,
            "latency_ms": 5853.8,
            "usd": 0.005001,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.6667,
        "ci95_hi": 0.6667,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1402.0,
        "output_tokens_mean": 53.333,
        "latency_ms_mean": 3979.033,
        "usd_sum": 0.015018
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-019",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "medium",
      "cuisine_region": "italian",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "9ea6cd79384f9ee2",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"Sunday ragù napoletano (long-cooked beef and pork)\",\n  \"beverage\": \"Campanian Aglianico, 5 years\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_TANNIN_PROTEIN",
          "HARMONY_REGIONAL",
          "STRUCT_BODY_ALIGN",
          "AROMA_SHARED_WOOD"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"HARMONY_REGIONAL\",\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_TANNIN_FAT\",\"STRUCT_INTENSITY_MATCH\",\"STRUCT_BODY_ALIGN\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "HARMONY_REGIONAL",
              "STRUCT_BODY_ALIGN",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_WOOD",
              "HARMONY_REGIONAL",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1400,
            "output_tokens": 50,
            "latency_ms": 3370.1,
            "usd": 0.00495,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"HARMONY_REGIONAL\",\"STRUCT_TANNIN_PROTEIN\",\"STRUCT_TANNIN_FAT\",\"STRUCT_BODY_ALIGN\",\"STRUCT_INTENSITY_MATCH\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "HARMONY_REGIONAL",
              "STRUCT_BODY_ALIGN",
              "STRUCT_INTENSITY_MATCH",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_WOOD",
              "HARMONY_REGIONAL",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1400,
            "output_tokens": 50,
            "latency_ms": 2460.3,
            "usd": 0.00495,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"HARMONY_REGIONAL\", \"STRUCT_TANNIN_PROTEIN\", \"STRUCT_TANNIN_FAT\", \"STRUCT_BODY_ALIGN\", \"AROMA_SHARED_EARTH\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "AROMA_SHARED_EARTH",
              "HARMONY_REGIONAL",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_FAT",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "expected": [
              "AROMA_SHARED_WOOD",
              "HARMONY_REGIONAL",
              "STRUCT_BODY_ALIGN",
              "STRUCT_TANNIN_PROTEIN"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1400,
            "output_tokens": 55,
            "latency_ms": 1310.9,
            "usd": 0.005025,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.6667,
      "score_summary": {
        "mean": 0.6667,
        "stdev": 0.0,
        "sem": 0.0,
        "ci95_lo": 0.6667,
        "ci95_hi": 0.6667,
        "n": 3
      },
      "score_if_valid": 0.6667,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1400.0,
        "output_tokens_mean": 51.667,
        "latency_ms_mean": 2380.433,
        "usd_sum": 0.014925
      }
    },
    {
      "task_id": "ogs:core:benchmark-task:xplain-020",
      "category": "explanation",
      "capability": "explanation_code_selection",
      "difficulty": "easy",
      "cuisine_region": "french",
      "scorer": "set_f1",
      "context_needs": [
        "vocab.explanation_codes.positive"
      ],
      "prompt_template_hash": "43005f44c870684b",
      "prompt": "You are a gastronomy expert evaluated by the Open Gastronomy Standard (OGS) Benchmark.\n\nYou will be given:\n- The relevant slices of the OGS specification (controlled vocabularies, scales,\n  JSON schemas) needed to answer the task correctly.\n- A task instruction, an input payload, and an answer format.\n\nFollow these rules:\n- Use ONLY the OGS controlled vocabularies supplied below. Do not invent new\n  enum values, tag names, or code IDs.\n- When the task asks for JSON, respond with a single JSON document and nothing\n  else. Do NOT wrap JSON in markdown code fences.\n- When the task asks for a single token (e.g. 'a', '3', 'fruit'), respond with\n  just the token and no surrounding prose.\n- Intensities use the OGS 0-10 scale unless otherwise stated\n  (0 = not detectable, 10 = overwhelming).\n- If the task supplies a list of candidates, your answer must come from that\n  list exactly.\n\n## OGS Context\n\n### OGS Explanation Codes (positive only)\n\nEach code has a fixed `id` (UPPER_SNAKE_CASE). 
Use the exact `id` string.\n\n**aromatic_bridge**\n- `AROMA_SHARED_FRUIT` — Shared Fruit Aromatics: Both dish and beverage share fruit aromatic families, creating an aromatic bridge.\n- `AROMA_SHARED_SPICE` — Shared Spice Notes: Spice aromatics appear in both dish and beverage, linking them aromatically.\n- `AROMA_SHARED_EARTH` — Shared Earth Notes: Earthy, mineral, or fungal aromatics bridge the dish and beverage.\n- `AROMA_SHARED_HERBAL` — Shared Herbal Notes: Herbal aromatics present in both dish and beverage create continuity.\n- `AROMA_SHARED_FLORAL` — Shared Floral Notes: Floral aromatics appear in both elements, creating an elegant aromatic link.\n- `AROMA_SHARED_WOOD` — Shared Wood/Oak Notes: Wood, smoke, or oak aromatics bridge dish and beverage (e.g., grilled meat with oaked wine).\n\n**contrast**\n- `CONTRAST_SWEET_ACID` — Sweet-Acid Contrast: Sweetness in one element contrasts with acidity in the other, creating dynamic tension.\n- `CONTRAST_RICH_CRISP` — Rich-Crisp Contrast: A rich, heavy dish is contrasted by a crisp, lean beverage (or vice versa).\n- `CONTRAST_SWEET_SALTY` — Sweet-Salty Contrast: Sweetness in the beverage contrasts with saltiness in the dish, creating interest.\n- `CONTRAST_TEMP_TEXTURE` — Temperature-Texture Contrast: Temperature or textural differences between dish and beverage create pleasant contrast.\n\n**harmony**\n- `HARMONY_FLAVOR_CONVERGE` — Flavor Profile Convergence: The overall flavor profiles of dish and beverage converge, creating a unified taste experience.\n- `HARMONY_TEXTURE_MATCH` — Textural Harmony: The textures of dish and beverage complement each other harmoniously.\n- `HARMONY_REGIONAL` — Regional Affinity: Dish and beverage share a regional origin, reflecting terroir-driven compatibility.\n- `HARMONY_COMPLEXITY_MATCH` — Complexity Match: Both dish and beverage have similar complexity levels, so neither outshines the other.\n\n**positive_structural**\n- `STRUCT_INTENSITY_MATCH` — Intensity Match: The overall 
flavor intensity of the dish and beverage are well-aligned, so neither overpowers the other.\n- `STRUCT_BODY_ALIGN` — Body Alignment: The body/weight of the dish matches the body of the beverage, creating a balanced mouthfeel.\n- `STRUCT_ACIDITY_BALANCE` — Acidity Balance: The beverage's acidity cuts through richness or matches the dish's acidity, creating freshness.\n- `STRUCT_TANNIN_FAT` — Tannin-Fat Synergy: Tannins in the beverage are softened by fat in the dish, creating a smooth, integrated sensation.\n- `STRUCT_TANNIN_PROTEIN` — Tannin-Protein Synergy: Proteins in the dish bind with tannins, reducing astringency and enhancing mouthfeel.\n- `STRUCT_SWEET_OFFSET` — Sweetness Offset: Residual sweetness in the beverage balances salt, spice, or acidity in the dish.\n- `STRUCT_UMAMI_ENHANCE` — Umami Enhancement: Umami in the dish enhances the perceived fruit and body of the beverage.\n- `STRUCT_ALCOHOL_WARMTH` — Alcohol Warmth Complement: The warmth of alcohol complements rich, hearty dishes without overpowering.\n\n**refresh**\n- `REFRESH_EFFERVESCE` — Effervescence Cleanse: Carbonation in the beverage physically scrubs the palate between bites of rich food.\n- `REFRESH_ACID` — Acid Refresh: High acidity in the beverage refreshes the palate, cutting through fat and richness.\n- `REFRESH_MINERAL` — Mineral Reset: Mineral character in the beverage provides a clean, palate-resetting effect.\n\n---\n\nReturn the OGS positive explanation codes that best explain why the pairing below works. Respond as a JSON array of code IDs (3 to 5 codes).\n\nINPUT:\n{\n  \"dish\": \"foie gras torchon with brioche\",\n  \"beverage\": \"Sauternes\"\n}\n\nANSWER FORMAT: set_of_codes",
      "expected": {
        "values": [
          "STRUCT_SWEET_OFFSET",
          "CONTRAST_SWEET_SALTY",
          "HARMONY_REGIONAL",
          "STRUCT_BODY_ALIGN"
        ]
      },
      "samples": [
        {
          "sample": 0,
          "raw_response": "[\"CONTRAST_SWEET_SALTY\",\"STRUCT_SWEET_OFFSET\",\"REFRESH_ACID\",\"STRUCT_ACIDITY_BALANCE\",\"HARMONY_TEXTURE_MATCH\"]",
          "compliant": true,
          "score": 0.4444,
          "detail": {
            "predicted": [
              "CONTRAST_SWEET_SALTY",
              "HARMONY_TEXTURE_MATCH",
              "REFRESH_ACID",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_SWEET_OFFSET"
            ],
            "expected": [
              "CONTRAST_SWEET_SALTY",
              "HARMONY_REGIONAL",
              "STRUCT_BODY_ALIGN",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.4,
            "recall": 0.5
          },
          "metrics": {
            "input_tokens": 1386,
            "output_tokens": 48,
            "latency_ms": 2601.0,
            "usd": 0.004878,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 1,
          "raw_response": "[\"CONTRAST_SWEET_SALTY\",\"STRUCT_SWEET_OFFSET\",\"STRUCT_ACIDITY_BALANCE\",\"REFRESH_ACID\",\"STRUCT_BODY_ALIGN\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "CONTRAST_SWEET_SALTY",
              "REFRESH_ACID",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_SWEET_OFFSET"
            ],
            "expected": [
              "CONTRAST_SWEET_SALTY",
              "HARMONY_REGIONAL",
              "STRUCT_BODY_ALIGN",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1386,
            "output_tokens": 48,
            "latency_ms": 1711.3,
            "usd": 0.004878,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        },
        {
          "sample": 2,
          "raw_response": "[\"CONTRAST_SWEET_SALTY\",\"STRUCT_SWEET_OFFSET\",\"STRUCT_ACIDITY_BALANCE\",\"REFRESH_ACID\",\"STRUCT_BODY_ALIGN\"]",
          "compliant": true,
          "score": 0.6667,
          "detail": {
            "predicted": [
              "CONTRAST_SWEET_SALTY",
              "REFRESH_ACID",
              "STRUCT_ACIDITY_BALANCE",
              "STRUCT_BODY_ALIGN",
              "STRUCT_SWEET_OFFSET"
            ],
            "expected": [
              "CONTRAST_SWEET_SALTY",
              "HARMONY_REGIONAL",
              "STRUCT_BODY_ALIGN",
              "STRUCT_SWEET_OFFSET"
            ],
            "precision": 0.6,
            "recall": 0.75
          },
          "metrics": {
            "input_tokens": 1386,
            "output_tokens": 48,
            "latency_ms": 3243.9,
            "usd": 0.004878,
            "model_version": "claude-sonnet-4-6",
            "finish_reason": "end_turn"
          }
        }
      ],
      "score": 0.5926,
      "score_summary": {
        "mean": 0.5926,
        "stdev": 0.1283,
        "sem": 0.0741,
        "ci95_lo": 0.4474,
        "ci95_hi": 0.7378,
        "n": 3
      },
      "score_if_valid": 0.5926,
      "format_rate": 1.0,
      "metrics": {
        "input_tokens_mean": 1386.0,
        "output_tokens_mean": 48.0,
        "latency_ms_mean": 2518.733,
        "usd_sum": 0.014634
      }
    }
  ],
  "category_scores": {
    "composition": {
      "mean_score": 0.9219,
      "ci95_lo": 0.8206,
      "ci95_hi": 1.0233,
      "stdev": 0.2313,
      "task_count": 20,
      "weight": 0.12,
      "format_rate": 1.0,
      "score_if_valid": 0.9219
    },
    "dietary": {
      "mean_score": 0.4733,
      "ci95_lo": 0.2287,
      "ci95_hi": 0.7178,
      "stdev": 0.4322,
      "task_count": 12,
      "weight": 0.06,
      "format_rate": 1.0,
      "score_if_valid": 0.4733
    },
    "explanation": {
      "mean_score": 0.6669,
      "ci95_lo": 0.5988,
      "ci95_hi": 0.735,
      "stdev": 0.1554,
      "task_count": 20,
      "weight": 0.1,
      "format_rate": 1.0,
      "score_if_valid": 0.6669
    },
    "menu_engineering": {
      "mean_score": 0.9583,
      "ci95_lo": 0.8767,
      "ci95_hi": 1.04,
      "stdev": 0.1178,
      "task_count": 8,
      "weight": 0.03,
      "format_rate": 1.0,
      "score_if_valid": 0.9583
    },
    "nutrition": {
      "mean_score": 0.4462,
      "ci95_lo": 0.1393,
      "ci95_hi": 0.7532,
      "stdev": 0.4952,
      "task_count": 10,
      "weight": 0.03,
      "format_rate": 1.0,
      "score_if_valid": 0.4462
    },
    "pairing": {
      "mean_score": 1.0,
      "ci95_lo": 1.0,
      "ci95_hi": 1.0,
      "stdev": 0.0,
      "task_count": 22,
      "weight": 0.15,
      "format_rate": 1.0,
      "score_if_valid": 1.0
    },
    "regional": {
      "mean_score": 0.9583,
      "ci95_lo": 0.8767,
      "ci95_hi": 1.04,
      "stdev": 0.2041,
      "task_count": 24,
      "weight": 0.05,
      "format_rate": 1.0,
      "score_if_valid": 0.9583
    },
    "risk": {
      "mean_score": 0.8667,
      "ci95_lo": 0.7213,
      "ci95_hi": 1.012,
      "stdev": 0.3316,
      "task_count": 20,
      "weight": 0.08,
      "format_rate": 1.0,
      "score_if_valid": 0.8667
    },
    "safety": {
      "mean_score": 0.9808,
      "ci95_lo": 0.9431,
      "ci95_hi": 1.0185,
      "stdev": 0.0666,
      "task_count": 12,
      "weight": 0.06,
      "format_rate": 1.0,
      "score_if_valid": 0.9808
    },
    "sensory": {
      "mean_score": 0.5837,
      "ci95_lo": 0.4386,
      "ci95_hi": 0.7289,
      "stdev": 0.3394,
      "task_count": 21,
      "weight": 0.12,
      "format_rate": 1.0,
      "score_if_valid": 0.5837
    },
    "structured_output": {
      "mean_score": 0.6833,
      "ci95_lo": 0.4796,
      "ci95_hi": 0.8871,
      "stdev": 0.4648,
      "task_count": 20,
      "weight": 0.12,
      "format_rate": 0.9333,
      "score_if_valid": 0.6833
    },
    "vocabulary": {
      "mean_score": 0.9565,
      "ci95_lo": 0.8713,
      "ci95_hi": 1.0417,
      "stdev": 0.2085,
      "task_count": 23,
      "weight": 0.08,
      "format_rate": 1.0,
      "score_if_valid": 0.9565
    }
  },
  "aggregate": {
    "score": 80.25,
    "ci95_lo": 76.24,
    "ci95_hi": 84.26,
    "scale": "0-100",
    "partial": false,
    "missing_categories": []
  },
  "metrics": {
    "latency_ms_mean": 2624.494,
    "latency_ms_median": 1280.567,
    "input_tokens_mean": 877.0,
    "output_tokens_mean": 109.9,
    "usd_total": 2.734734,
    "tasks_with_usd": 213
  },
  "weights": {
    "vocabulary": 0.08,
    "sensory": 0.12,
    "composition": 0.12,
    "pairing": 0.15,
    "explanation": 0.1,
    "risk": 0.08,
    "structured_output": 0.12,
    "regional": 0.05,
    "dietary": 0.06,
    "safety": 0.06,
    "nutrition": 0.03,
    "menu_engineering": 0.03
  },
  "judge": {
    "judge_model": "gpt-5.4",
    "judge_prompt_hash": "756db6a487714bca",
    "judge_validated": false,
    "excluded_task_ids": [
      "ogs:core:benchmark-task:menu-009"
    ],
    "note": "Rubric-judge tasks are scored and retained in `tasks`, but excluded from `aggregate` until the judge clears the human-validation protocol (r > 0.8 against a 50-task sample)."
  }
}