diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 46445d2..f14237e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,7 +40,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install package - run: python -m pip install .[test] + run: python -m pip install .[test,schema] - name: Test run: python -m pytest -ra diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8847338..8c6796d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,7 +39,6 @@ repos: - importlib_metadata - importlib_resources - - repo: https://github.com/pre-commit/pygrep-hooks rev: v1.10.0 hooks: @@ -72,3 +71,9 @@ repos: hooks: - id: check-readthedocs - id: check-github-workflows + - id: check-metaschema + files: ^src/uhi/resources/histogram.json$ + - id: check-jsonschema + name: Validate Histogram examples + args: [--schemafile, src/uhi/resources/histogram.json] + files: ^tests/resources/.*\.json diff --git a/docs/conf.py b/docs/conf.py index 9c5a56a..6fd87d6 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -32,6 +32,7 @@ # ones. extensions = [ "myst_parser", + "sphinx-jsonschema", "sphinx.ext.napoleon", "sphinx_copybutton", "sphinx_github_changelog", diff --git a/docs/index.rst b/docs/index.rst index 55cc258..4a9dfd6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -30,6 +30,7 @@ to plot a histogram, including error bars. indexing.rst indexing+.rst plotting.rst + serialization.md changelog.md diff --git a/docs/serialization.md b/docs/serialization.md new file mode 100644 index 0000000..cd2bddf --- /dev/null +++ b/docs/serialization.md @@ -0,0 +1,118 @@ +# Serialization + + +## Introduction + +Histogram serialization has to cover a wide range of formats. As such, we +describe a form for serialization that covers the metadata structure as +JSON-like, with a provided JSON-schema. The data (bins and/or variable edges) +is stored out-of-band in a binary format based on what type of data file you +are in. 
For very small (primarily 1D) histograms, data is allowed inline as +well. + +The following formats are being targeted: + +``` +┌────────┐ ┌────────┐ ┌───────┐ +│ ROOT │ │ HDF5 │ │ ZIP │ +└────────┘ └────────┘ └───────┘ +``` + +Other formats can be used as well, assuming they support out-of-band data and +text attributes or files for the metadata. + +## Caveats + +This structure was based heavily on boost-histogram, but it is intended to be +general, and can be expanded in the future as needed. As such, the following +limitations apply: + +* Serialization followed by deserialization may cause axis changes. Axis types + may change to an equivalent but less performant axis, growth status will be + lost, etc. +* Metadata must be expressible as JSON. It should also be reasonably sized; some + formats like HDF5 may limit the size of attributes to 64K. +* Floating point errors could be incurred on conversion, as the storage format + uses a stable but different representation. +* Axis `name` is only part of the metadata, and is not standardized. This is + due to lack of support from boost-histogram. + +## Design + +The following axis types are supported: + +* `"regular"`: A regularly spaced set of even bins. Boost-histogram's "integer" + axes map to this axis as well. Has `upper`, `lower`, `bins`, `underflow`, + `overflow`, and `circular` properties. `circular` defaults to False if not + present. +* `"variable"`: A continuous axis defined by bins+1 edges. Has `edges`, which + is either an in-line list of numbers or a string pointing to an out-of-band data source. + Also has `underflow`, `overflow`, and `circular` properties. `circular` + defaults to False if not present. +* `"category_int"`: A list of integer bins, non-continuous. Has `categories`, + which is an in-line list of integers. Also has `flow`. +* `"category_str"`: A list of string bins. Has `categories`, + which is an in-line list of strings. Also has `flow`. +* `"boolean"`: A true/false axis. 
+ +Axes with gaps are currently not supported. + +All axes support `metadata`, a string-keyed dictionary of arbitrary, JSON-like data. + +The following storages are supported: + +* `"int"`: A collection of integers. Boost-histogram's Int64 and AtomicInt64 + map to this, and sometimes Unlimited. +* `"double"`: A collection of 64-bit floating point values. Boost-histogram's + Double storage maps to this, and sometimes Unlimited. +* `"weighted"`: A collection of two arrays of 64-bit floating point values, + `"values"` and `"variances"`. Boost-histogram's Weight storage maps to this. +* `"mean"`: A collection of three arrays of 64-bit floating point values, + `"counts"`, `"values"`, and `"variances"`. Boost-histogram's Mean storage maps to + this. +* `"weighted_mean"`: A collection of four arrays of 64-bit floating point + values, `"sum_of_weights"`, `"sum_of_weights_squared"`, `"values"`, and + `"variances"`. Boost-histogram's WeightedMean storage maps to this. + +## CLI/API + +You can currently test a JSON file against the schema by running: + +```console +$ python -m uhi.schema some/file.json +``` + +Or with code: + +```python +import uhi.schema + +uhi.schema.validate("some/file.json") +``` + +Eventually this should also be usable for JSON files inside ZIP archives, HDF5 attributes, +and maybe more. + +```{warning} + +Currently, this spec describes **how to prepare the metadata** for one of the +targeted backends. It does not yet cover backend-specific details, like how to +define and use the binary resource locator strings or how to store the data. +JSON is not a target spec, but just part of the ZIP spec, meaning the files +that currently "pass" the tool above would be valid inside a `.zip` file +eventually, but are not valid by themselves. 
+``` + +## Rendered schema + +```{jsonschema} ../src/uhi/resources/histogram.json +``` + + +## Full schema + +The full schema is below: + +```{literalinclude} ../src/uhi/resources/histogram.json +:language: json +``` diff --git a/noxfile.py b/noxfile.py index 0f6205f..2a576a9 100644 --- a/noxfile.py +++ b/noxfile.py @@ -28,7 +28,7 @@ def tests(session): """ Run the unit and regular tests. """ - session.install("-e.[test]") + session.install("-e.[test,schema]") session.run("pytest", *session.posargs) diff --git a/pyproject.toml b/pyproject.toml index 1ba50a0..28173b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,10 +44,15 @@ Documentation = "https://uhi.readthedocs.io/en/latest/" Changelog = "https://github.com/scikit-hep/uhi/releases" [project.optional-dependencies] +schema = [ + "fastjsonschema", + "importlib-resources; python_version<'3.9'", +] docs = [ "sphinx>=4.0", "furo", "sphinx-copybutton>=0.3.1", + "sphinx-jsonschema", "myst-parser", "sphinx_github_changelog", ] @@ -88,6 +93,10 @@ show_error_codes = true warn_unreachable = true enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"] +[[tool.mypy.overrides]] +module = ["fastjsonschema"] +ignore_missing_imports = true + [tool.ruff] select = [ "E", "F", "W", # flake8 diff --git a/src/uhi/resources/histogram.json b/src/uhi/resources/histogram.json new file mode 100644 index 0000000..e62ecbb --- /dev/null +++ b/src/uhi/resources/histogram.json @@ -0,0 +1,290 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://raw.githubusercontent.com/scikit-hep/uhi/henryiii/feat/schema/src/uhi/resources/histogram.json", + "title": "Histogram", + "type": "object", + "additionalProperties": false, + "patternProperties": { + ".+": { + "type": "object", + "required": ["axes", "storage"], + "additionalProperties": false, + "properties": { + "metadata": { + "type": "object", + "description": "Arbitrary metadata dictionary." 
+ }, + "axes": { + "type": "array", + "description": "A list of the axes of the histogram.", + "items": { + "oneOf": [ + { "$ref": "#/$defs/regular_axis" }, + { "$ref": "#/$defs/variable_axis" }, + { "$ref": "#/$defs/category_str_axis" }, + { "$ref": "#/$defs/category_int_axis" }, + { "$ref": "#/$defs/boolean_axis" } + ] + } + }, + "storage": { + "description": "The storage of the bins of the histogram.", + "oneOf": [ + { "$ref": "#/$defs/int_storage" }, + { "$ref": "#/$defs/double_storage" }, + { "$ref": "#/$defs/weighted_storage" }, + { "$ref": "#/$defs/mean_storage" }, + { "$ref": "#/$defs/weighted_mean_storage" } + ] + } + } + } + }, + "$defs": { + "regular_axis": { + "type": "object", + "description": "An evenly spaced set of continuous bins.", + "required": ["type", "lower", "upper", "bins", "underflow", "overflow"], + "additionalProperties": false, + "properties": { + "type": { "type": "string", "const": "regular" }, + "lower": { "type": "number", "description": "Lower edge of the axis." }, + "upper": { "type": "number", "description": "Upper edge of the axis." }, + "bins": { + "type": "integer", + "minimum": 0, + "description": "Number of bins in the axis." + }, + "underflow": { + "type": "boolean", + "description": "True if there is a bin for underflow." + }, + "overflow": { + "type": "boolean", + "description": "True if there is a bin for overflow." + }, + "circular": { + "type": "boolean", + "description": "True if the axis wraps around." + }, + "metadata": { + "type": "object", + "description": "Arbitrary metadata dictionary." 
+ } + } + }, + "variable_axis": { + "type": "object", + "description": "A variably spaced set of continuous bins.", + "required": ["type", "edges", "underflow", "overflow"], + "additionalProperties": false, + "properties": { + "type": { "type": "string", "const": "variable" }, + "edges": { + "oneOf": [ + { + "type": "array", + "items": { "type": "number" }, "minItems": 2, "uniqueItems": true + }, + { + "type": "string", + "description": "A path (URI?) to the edges data." + } + ] + }, + "underflow": { "type": "boolean" }, + "overflow": { "type": "boolean" }, + "circular": { "type": "boolean" }, + "metadata": { + "type": "object", + "description": "Arbitrary metadata dictionary." + } + } + }, + "category_str_axis": { + "type": "object", + "description": "A set of string categorical bins.", + "required": ["type", "categories", "flow"], + "additionalProperties": false, + "properties": { + "type": { "type": "string", "const": "category_str" }, + "categories": { + "type": "array", + "items": { "type": "string" }, + "uniqueItems": true + }, + "flow": { + "type": "boolean", + "description": "True if flow bin (at the overflow position) present." + }, + "metadata": { + "type": "object", + "description": "Arbitrary metadata dictionary." + } + } + }, + "category_int_axis": { + "type": "object", + "description": "A set of integer categorical bins in any order.", + "required": ["type", "categories", "flow"], + "additionalProperties": false, + "properties": { + "type": { "type": "string", "const": "category_int" }, + "categories": { + "type": "array", + "items": { "type": "integer" }, + "uniqueItems": true + }, + "flow": { + "type": "boolean", + "description": "True if flow bin (at the overflow position) present." 
+ } + } + }, + "boolean_axis": { + "type": "object", + "description": "A simple true/false axis with no flow.", + "required": ["type"], + "additionalProperties": false, + "properties": { + "type": { "type": "string", "const": "boolean" }, + "metadata": { + "type": "object", + "description": "Arbitrary metadata dictionary." + } + } + }, + "int_storage": { + "type": "object", + "description": "A storage holding integer counts.", + "required": ["type", "data"], + "additionalProperties": false, + "properties": { + "type": { "type": "string", "const": "int" }, + "data": { + "oneOf": [ + { + "type": "string", + "description": "A path (URI?) to the integer bin data." + }, + { "type": "array", "items": { "type": "integer" } } + ] + } + } + }, + "double_storage": { + "type": "object", + "description": "A storage holding floating point counts.", + "required": ["type", "data"], + "additionalProperties": false, + "properties": { + "type": { "type": "string", "const": "double" }, + "data": { + "oneOf": [ + { + "type": "string", + "description": "A path (URI?) to the floating point bin data." + }, + { "type": "array", "items": { "type": "number" } } + ] + } + } + }, + "weighted_storage": { + "type": "object", + "description": "A storage holding floating point counts and variances.", + "required": ["type", "data"], + "additionalProperties": false, + "properties": { + "type": { "type": "string", "const": "weighted" }, + "data": { + "oneOf": [ + { + "type": "string", + "description": "A path (URI?) 
to the floating point bin data; outer dimension is [value, variance]" + }, + { + "type": "object", + "required": ["values", "variances"], + "additionalProperties": false, + "properties": { + "values": { "type": "array", "items": { "type": "number" } }, + "variances": { "type": "array", "items": { "type": "number" } } + } + } + ] + } + } + }, + "mean_storage": { + "type": "object", + "description": "A storage holding 'profile'-style floating point counts, values, and variances.", + "required": ["type", "data"], + "additionalProperties": false, + "properties": { + "type": { "type": "string", "const": "mean" }, + "data": { + "oneOf": [ + { + "type": "string", + "description": "A path (URI?) to the floating point bin data; outer dimension is [counts, value, variance]" + }, + { + "type": "object", + "required": ["counts", "values", "variances"], + "additionalProperties": false, + "properties": { + "counts": { "type": "array", "items": { "type": "number" } }, + "values": { "type": "array", "items": { "type": "number" } }, + "variances": { "type": "array", "items": { "type": "number" } } + } + } + ] + } + } + }, + "weighted_mean_storage": { + "type": "object", + "description": "A storage holding 'profile'-style floating point ∑weights, ∑weights², values, and variances.", + "required": ["type", "data"], + "additionalProperties": false, + "properties": { + "type": { "type": "string", "const": "weighted_mean" }, + "data": { + "oneOf": [ + { + "type": "string", + "description": "A path (URI?) 
to the floating point bin data; outer dimension is [∑weights, ∑weights², value, variance]" + }, + { + "type": "object", + "required": [ + "sum_of_weights", + "sum_of_weights_squared", + "values", + "variances" + ], + "additionalProperties": false, + "properties": { + "sum_of_weights": { + "type": "array", + "items": { "type": "number" } + }, + "sum_of_weights_squared": { + "type": "array", + "items": { "type": "number" } + }, + "values": { "type": "array", "items": { "type": "number" } }, + "variances": { "type": "array", "items": { "type": "number" } } + } + } + ] + } + } + } + } +}
diff --git a/src/uhi/schema.py b/src/uhi/schema.py
new file mode 100644
index 0000000..3d1662e
--- /dev/null
+++ b/src/uhi/schema.py
@@ -0,0 +1,36 @@
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+
+import fastjsonschema
+
+if sys.version_info < (3, 9):
+    import importlib_resources as resources
+else:
+    from importlib import resources
+
+histogram_file = resources.files("uhi") / "resources/histogram.json"
+
+with histogram_file.open(encoding="utf-8") as f:
+    histogram_schema = fastjsonschema.compile(json.load(f))
+
+
+def validate(path: str | Path) -> None:
+    """Validate the JSON file at ``path`` against the histogram schema.
+
+    Errors from reading, parsing, or schema validation propagate to the caller.
+    """
+    with Path(path).open(encoding="utf-8") as f:
+        example = json.load(f)
+
+    histogram_schema(example)
+
+
+if __name__ == "__main__":
+    # Accept one or more files; fail with a usage message instead of a TypeError.
+    if len(sys.argv) < 2:
+        raise SystemExit("usage: python -m uhi.schema FILE [FILE ...]")
+    for filename in sys.argv[1:]:
+        validate(filename)
diff --git a/tests/resources/reg.json b/tests/resources/reg.json new file mode 100644 index 0000000..0e7643e --- /dev/null +++ b/tests/resources/reg.json @@ -0,0 +1,30 @@ +{ + "one": { + "metadata": {}, + "axes": [ + { + "type": "regular", + "lower": 0, + "upper": 5, + "bins": 3, + "underflow": true, + "overflow": true, + "circular": false + } + ], + "storage": { "type": "int", "data": [1, 2, 3, 4, 5] } + }, + "two": { + "axes": [ + { + "type": "regular", + "lower": 0, + "upper": 5, + "bins": 5, + "underflow": true, + "overflow": true + } + ], + "storage": { "type": "double", "data": "some/path/depends/on/format" } + } 
+}
diff --git a/tests/test_histogram_schema.py b/tests/test_histogram_schema.py
new file mode 100644
index 0000000..4f02c4d
--- /dev/null
+++ b/tests/test_histogram_schema.py
@@ -0,0 +1,13 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import uhi.schema
+
+DIR = Path(__file__).parent.resolve()
+
+
+def test_example_1() -> None:
+    """The regular-axis example file must validate without raising."""
+    example = DIR / "resources/reg.json"
+    uhi.schema.validate(example)