{
  "schema": {
    "system": "name of the system under test",
    "dataset": "dataset directory name (e.g. cases, primock57)",
    "claimLevel": "powered for ranked benchmark rows; smoke for illustrative/demo rows that must not be ranked",
    "n": "number of cases scored",
    "narrativeMean": "0-100, higher better",
    "fidelityMean": "1-5 input-fidelity dimension mean, higher better",
    "dangerousFabricationRate": "0-1, lower better (PRIMARY RANK KEY)",
    "leakRate": "0-1, lower better",
    "perDimension": "mean of each of the 6 narrative dimensions",
    "judgeModel": "the judge model used (materially affects scores — required)",
    "scoredAt": "ISO date the submitter ran the benchmark"
  },
  "results": [
    {
      "system": "claude-sonnet (scribe)",
      "dataset": "primock57",
      "claimLevel": "powered",
      "n": 57,
      "nErrored": 0,
      "repeats": 2,
      "narrativeMean": 78.4,
      "narrativeMeanCI": [
        76.46,
        80.43
      ],
      "dangerousFabricationRate": 0.053,
      "dangerousFabricationRateCI": [
        0,
        0.12
      ],
      "leakRate": 0,
      "fidelityMean": 4.46,
      "perDimension": {
        "storyCohesion": 4.34,
        "clinicalCompleteness": 3.13,
        "naturalFlow": 4.33,
        "absenceOfArtifacts": 4.39,
        "physicianReadability": 4.15,
        "inputFidelity": 4.46
      },
      "judgeModel": "opus",
      "scoredAt": "2026-06-02",
      "notesPublished": false,
      "note": "Scores-only baseline (closed model). Generic scribe prompt, not a tuned system. n=57 PriMock57 consults, repeats=2."
    },
    {
      "system": "gpt-4.1 (scribe)",
      "dataset": "primock57",
      "claimLevel": "powered",
      "n": 57,
      "nErrored": 0,
      "repeats": 2,
      "narrativeMean": 73.6,
      "narrativeMeanCI": [
        72.25,
        74.9
      ],
      "dangerousFabricationRate": 0.053,
      "dangerousFabricationRateCI": [
        0,
        0.12
      ],
      "leakRate": 0,
      "fidelityMean": 4.27,
      "perDimension": {
        "storyCohesion": 4.04,
        "clinicalCompleteness": 3.16,
        "naturalFlow": 3.97,
        "absenceOfArtifacts": 4.2,
        "physicianReadability": 3.99,
        "inputFidelity": 4.27
      },
      "judgeModel": "opus",
      "scoredAt": "2026-06-02",
      "notesPublished": false,
      "note": "Scores-only baseline (closed model). Generic scribe prompt, not a tuned system. n=57 PriMock57 consults, repeats=2."
    },
    {
      "system": "gpt-4o (scribe)",
      "dataset": "primock57",
      "claimLevel": "powered",
      "n": 57,
      "nErrored": 0,
      "repeats": 2,
      "narrativeMean": 67.4,
      "narrativeMeanCI": [
        65.91,
        68.72
      ],
      "dangerousFabricationRate": 0.088,
      "dangerousFabricationRateCI": [
        0.02,
        0.18
      ],
      "leakRate": 0,
      "fidelityMean": 3.91,
      "perDimension": {
        "storyCohesion": 3.99,
        "clinicalCompleteness": 2.76,
        "naturalFlow": 3.54,
        "absenceOfArtifacts": 4.04,
        "physicianReadability": 3.87,
        "inputFidelity": 3.91
      },
      "judgeModel": "opus",
      "scoredAt": "2026-06-02",
      "notesPublished": false,
      "note": "Scores-only baseline (closed model). Generic scribe prompt, not a tuned system. n=57 PriMock57 consults, repeats=2."
    },
    {
      "system": "claude-haiku (scribe)",
      "dataset": "primock57",
      "claimLevel": "powered",
      "n": 57,
      "nErrored": 0,
      "repeats": 2,
      "narrativeMean": 67.5,
      "narrativeMeanCI": [
        65.7,
        69.32
      ],
      "dangerousFabricationRate": 0.246,
      "dangerousFabricationRateCI": [
        0.14,
        0.35
      ],
      "leakRate": 0,
      "fidelityMean": 4.22,
      "perDimension": {
        "storyCohesion": 3.99,
        "clinicalCompleteness": 2.66,
        "naturalFlow": 3.57,
        "absenceOfArtifacts": 3.88,
        "physicianReadability": 3.85,
        "inputFidelity": 4.22
      },
      "judgeModel": "opus",
      "scoredAt": "2026-06-02",
      "notesPublished": false,
      "note": "Scores-only baseline (closed model). Generic scribe prompt, not a tuned system. n=57 PriMock57 consults, repeats=2."
    },
    {
      "system": "gpt-4o (scribe)",
      "dataset": "cases",
      "claimLevel": "smoke",
      "n": 3,
      "nErrored": 0,
      "repeats": 2,
      "narrativeMean": 71.7,
      "narrativeMeanCI": [
        65,
        77
      ],
      "dangerousFabricationRate": 0,
      "dangerousFabricationRateCI": [
        0,
        0
      ],
      "leakRate": 0,
      "fidelityMean": 4.67,
      "perDimension": {
        "storyCohesion": 4,
        "clinicalCompleteness": 2.17,
        "naturalFlow": 3.5,
        "absenceOfArtifacts": 5,
        "physicianReadability": 3.83,
        "inputFidelity": 4.67
      },
      "judgeModel": "opus",
      "scoredAt": "2026-06-01",
      "notesPublished": false,
      "note": "Scores-only baseline (closed model — note text not published per data policy). Generic scribe prompt, not a tuned system."
    },
    {
      "system": "claude-sonnet (scribe)",
      "dataset": "cases",
      "claimLevel": "smoke",
      "n": 3,
      "nErrored": 0,
      "repeats": 2,
      "narrativeMean": 68,
      "narrativeMeanCI": [
        58,
        75
      ],
      "dangerousFabricationRate": 0.333,
      "dangerousFabricationRateCI": [
        0,
        1
      ],
      "leakRate": 0,
      "fidelityMean": 4.67,
      "perDimension": {
        "storyCohesion": 4,
        "clinicalCompleteness": 2,
        "naturalFlow": 3.33,
        "absenceOfArtifacts": 4.33,
        "physicianReadability": 4,
        "inputFidelity": 4.67
      },
      "judgeModel": "opus",
      "scoredAt": "2026-06-01",
      "notesPublished": false,
      "note": "Scores-only baseline (closed model). Flagged fabricating EMS transport on SYN-003 — a real fabrication catch on a frontier model."
    },
    {
      "system": "example-baseline (seeded fab)",
      "dataset": "cases",
      "claimLevel": "smoke",
      "n": 3,
      "nErrored": 0,
      "repeats": 2,
      "narrativeMean": 59,
      "narrativeMeanCI": [
        27,
        77
      ],
      "dangerousFabricationRate": 0.333,
      "dangerousFabricationRateCI": [
        0,
        1
      ],
      "leakRate": 0,
      "fidelityMean": 3.5,
      "perDimension": {
        "storyCohesion": 3.67,
        "clinicalCompleteness": 2.17,
        "naturalFlow": 3.67,
        "absenceOfArtifacts": 3.67,
        "physicianReadability": 3.5,
        "inputFidelity": 3.5
      },
      "judgeModel": "opus",
      "scoredAt": "2026-06-01",
      "notesPublished": true,
      "note": "Reference row shipped with the repo; SYN-003 carries a deliberate seeded fabrication."
    }
  ]
}
