{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://opengastronomy.org/schema/v0.1/benchmark-result.schema.json",
  "title": "OGS Benchmark Result",
  "description": "Output of a benchmark run against a specific model.",
  "type": "object",
  "required": ["benchmark_version", "ogs_version", "model", "run_at", "tasks", "category_scores", "aggregate"],
  "properties": {
    "benchmark_version": {
      "description": "Benchmark version as three dot-separated groups of digits, e.g. 1.2.3. Suffixes such as pre-release tags do not match the pattern.",
      "type": "string",
      "pattern": "^\\d+\\.\\d+\\.\\d+$"
    },
    "ogs_version": {
      "description": "OGS version string. This revision of the schema accepts only the value 0.1.0.",
      "type": "string",
      "const": "0.1.0"
    },
    "run_id": {
      "description": "Optional identifier for the run. Any string is accepted.",
      "type": "string"
    },
    "run_at": {
      "description": "When the run took place, as an RFC 3339 date-time string. Validators treat `format` as an annotation unless format assertion is enabled, so malformed timestamps may pass validation.",
      "type": "string",
      "format": "date-time"
    },
    "model": {
      "description": "The model the benchmark was run against. Only `id` is required; unlisted fields are permitted.",
      "type": "object",
      "required": ["id"],
      "properties": {
        "id": {
          "description": "Non-empty model identifier.",
          "type": "string",
          "minLength": 1
        },
        "adapter": {
          "description": "Optional adapter name.",
          "type": "string"
        },
        "provider": {
          "description": "Optional provider name.",
          "type": "string"
        },
        "options": {
          "description": "Optional free-form object; its contents are not validated.",
          "type": "object",
          "additionalProperties": true
        }
      },
      "additionalProperties": true
    },
    "tasks": {
      "description": "List of task result objects. Each entry must carry `task_id`, `category` and `score`; unlisted fields are permitted.",
      "type": "array",
      "items": {
        "type": "object",
        "required": ["task_id", "category", "score"],
        "properties": {
          "task_id": {
            "description": "Non-empty identifier of the task.",
            "type": "string",
            "minLength": 1
          },
          "category": {
            "description": "Non-empty category name for the task.",
            "type": "string",
            "minLength": 1
          },
          "capability": { "type": "string" },
          "difficulty": { "type": "string" },
          "score": {
            "description": "Score for the task, from 0 to 1 inclusive.",
            "type": "number",
            "minimum": 0,
            "maximum": 1
          },
          "prompt": {
            "description": "Optional prompt text.",
            "type": "string"
          },
          "raw_response": {
            "description": "Optional raw response text.",
            "type": "string"
          },
          "parsed_response": {
            "description": "Optional parsed form of the response; any JSON value is accepted."
          },
          "expected": {
            "description": "Optional expected value; any JSON value is accepted."
          },
          "scorer": {
            "description": "Optional scorer name.",
            "type": "string"
          },
          "error": {
            "description": "Optional error message.",
            "type": "string"
          }
        },
        "additionalProperties": true
      }
    },
    "category_scores": {
      "description": "Per-category summaries keyed by category name. Keys must be made up only of lowercase ASCII letters and underscores; any other key is rejected.",
      "type": "object",
      "patternProperties": {
        "^[a-z_]+$": {
          "type": "object",
          "required": ["mean_score", "task_count"],
          "properties": {
            "mean_score": {
              "description": "Mean score for the category, from 0 to 1 inclusive.",
              "type": "number",
              "minimum": 0,
              "maximum": 1
            },
            "task_count": {
              "description": "Task count for the category; a non-negative integer.",
              "type": "integer",
              "minimum": 0
            },
            "weight": {
              "description": "Optional non-negative weight for the category.",
              "type": "number",
              "minimum": 0
            }
          },
          "additionalProperties": true
        }
      },
      "additionalProperties": false
    },
    "aggregate": {
      "description": "Overall result. `score` and `scale` are required; the two ci95 bounds are optional but must be supplied together.",
      "type": "object",
      "required": ["score", "scale"],
      "dependentRequired": {
        "ci95_lo": ["ci95_hi"],
        "ci95_hi": ["ci95_lo"]
      },
      "properties": {
        "score": {
          "description": "Aggregate score, from 0 to 100 inclusive.",
          "type": "number",
          "minimum": 0,
          "maximum": 100
        },
        "ci95_lo": {
          "description": "Lower bound of the ci95 interval, from 0 to 100 inclusive. Must be supplied together with `ci95_hi`.",
          "type": "number",
          "minimum": 0,
          "maximum": 100
        },
        "ci95_hi": {
          "description": "Upper bound of the ci95 interval, from 0 to 100 inclusive. Must be supplied together with `ci95_lo`.",
          "type": "number",
          "minimum": 0,
          "maximum": 100
        },
        "scale": {
          "description": "Scale label; always the string 0-100.",
          "type": "string",
          "const": "0-100"
        },
        "partial": {
          "description": "Optional flag marking the aggregate as partial.",
          "type": "boolean"
        },
        "missing_categories": {
          "description": "Optional list of category names reported as missing.",
          "type": "array",
          "items": { "type": "string" }
        }
      },
      "additionalProperties": true
    },
    "config": {
      "description": "Optional run configuration. Unlisted fields are permitted.",
      "type": "object",
      "properties": {
        "samples": {
          "description": "Sample count; an integer of at least 1.",
          "type": "integer",
          "minimum": 1
        },
        "closed_book": { "type": "boolean" },
        "seed": {
          "description": "Integer seed, or null.",
          "type": ["integer", "null"]
        },
        "skip_disputed": { "type": "boolean" }
      },
      "additionalProperties": true
    },
    "reproducibility": {
      "description": "Optional reproducibility data. Unlisted fields are permitted.",
      "type": "object",
      "properties": {
        "task_set_sha": {
          "description": "Hash string for the task set; the algorithm and encoding are not constrained by this schema.",
          "type": "string"
        },
        "task_count": {
          "description": "Task count; a non-negative integer.",
          "type": "integer",
          "minimum": 0
        },
        "prompt_hashes": {
          "description": "Map whose values are hash strings; keys are not constrained by this schema.",
          "type": "object",
          "additionalProperties": { "type": "string" }
        }
      },
      "additionalProperties": true
    },
    "metrics": {
      "description": "Optional free-form object; its contents are not validated.",
      "type": "object",
      "additionalProperties": true
    },
    "environment": {
      "description": "Optional free-form object; its contents are not validated.",
      "type": "object",
      "additionalProperties": true
    },
    "judge": {
      "description": "Optional judge data. Unlisted fields are permitted.",
      "type": "object",
      "properties": {
        "judge_model": { "type": "string" },
        "judge_prompt_hash": { "type": "string" },
        "judge_validated": { "type": "boolean" },
        "excluded_task_ids": {
          "description": "Optional list of task ids reported as excluded.",
          "type": "array",
          "items": { "type": "string" }
        },
        "note": { "type": "string" }
      },
      "additionalProperties": true
    },
    "weights": {
      "description": "Optional map of non-negative numeric weights. Keys are not constrained by this schema.",
      "type": "object",
      "additionalProperties": { "type": "number", "minimum": 0 }
    }
  },
  "additionalProperties": true
}
