From 62bced689cd803b61e3dda030924d8dcb72a8fa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 11 Nov 2024 15:40:38 +0100 Subject: [PATCH 01/14] fix cf decoding of grid_mapping --- xarray/conventions.py | 47 ++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 133dbf00063..490fe359aef 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -31,6 +31,7 @@ "formula_terms", ) CF_RELATED_DATA_NEEDS_PARSING = ( + "grid_mapping", "cell_measures", "formula_terms", ) @@ -477,17 +478,39 @@ def stackable(dim: Hashable) -> bool: for attr_name in CF_RELATED_DATA: if attr_name in var_attrs: attr_val = var_attrs[attr_name] - if attr_name not in CF_RELATED_DATA_NEEDS_PARSING: - var_names = attr_val.split() - else: - roles_and_names = [ - role_or_name - for part in attr_val.split(":") - for role_or_name in part.split() - ] - if len(roles_and_names) % 2 == 1: - emit_user_level_warning(f"Attribute {attr_name} malformed") - var_names = roles_and_names[1::2] + # fixes stray colon + # todo: add test + var_names = attr_val.replace(" :", ":").split() + # if grid_mapping is a single string, do not enter here + if ( + attr_name in CF_RELATED_DATA_NEEDS_PARSING + and len(var_names) > 1 + ): + # map the keys to list of strings + roles_and_names = defaultdict(list) + key = None + for v in var_names: + if ":" in v: + key = v.strip(":") + else: + # todo: test this + if key is None: + raise ValueError( + f"First element {v} of {attr_name} misses ':'" + ) + roles_and_names[key].append(v) + # for grid_mapping keys are var_names + if attr_name == "grid_mapping": + var_names = list(roles_and_names.keys()) + else: + # for cell_measures and formula_terms values are var names + var_names = [lst[0] for lst in roles_and_names.values()] + # consistency check + # todo: test this + if len(var_names) != len(roles_and_names.keys()): + emit_user_level_warning( + f"Attribute {attr_name} malformed" + ) if all(var_name in variables for var_name in var_names): new_vars[k].encoding[attr_name] = attr_val coord_names.update(var_names) @@ -732,7 +755,7 @@ def _encode_coordinates( # the dataset faithfully. Because this serialization goes beyond CF # conventions, only do it if necessary. # Reference discussion: - # http://mailman.cgd.ucar.edu/pipermail/cf-metadata/2014/007571.html + # https://cfconventions.org/mailing-list-archive/Data/7400.html global_coordinates.difference_update(written_coords) if global_coordinates: attributes = dict(attributes) From d6476d0fa543ce2e70de6ffc784c954c49f90190 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 11 Nov 2024 15:47:59 +0100 Subject: [PATCH 02/14] fix linter --- xarray/conventions.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index c46bdcd9337..b52585db07d 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -489,16 +489,16 @@ def stackable(dim: Hashable) -> bool: # map the keys to list of strings roles_and_names = defaultdict(list) key = None - for v in var_names: - if ":" in v: - key = v.strip(":") + for vname in var_names: + if ":" in vname: + key = vname.strip(":") else: # todo: test this if key is None: raise ValueError( - f"First element {v} of {attr_name} misses ':'" + f"First element {vname} of {attr_name} misses ':'" ) - roles_and_names[key].append(v) + roles_and_names[key].append(vname) # for grid_mapping keys are var_names if attr_name == "grid_mapping": var_names = list(roles_and_names.keys()) From 5982d78fb98c16640f81bf655de2a1f2d5c2bdfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 12 Nov 2024 08:39:53 +0100 Subject: [PATCH 03/14] unnest list, add tests --- xarray/conventions.py | 19 ++++++++++--------- xarray/tests/test_backends.py | 24 ++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index b52585db07d..02b24a4ff50 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -477,10 +477,9 @@ def stackable(dim: Hashable) -> bool: if decode_coords == "all": for attr_name in CF_RELATED_DATA: if attr_name in var_attrs: - attr_val = var_attrs[attr_name] # fixes stray colon - # todo: add test - var_names = attr_val.replace(" :", ":").split() + attr_val = var_attrs[attr_name].replace(" :", ":") + var_names = attr_val.split() # if grid_mapping is a single string, do not enter here if ( attr_name in CF_RELATED_DATA_NEEDS_PARSING @@ -493,10 +492,10 @@ def stackable(dim: Hashable) -> bool: if ":" in vname: key = vname.strip(":") else: - # todo: test this if key is None: raise ValueError( - f"First element {vname} of {attr_name} misses ':'" + f"First element {vname!r} of [{attr_val!r}] misses ':', " + f"cannot decode {attr_name!r}." ) roles_and_names[key].append(vname) # for grid_mapping keys are var_names @@ -504,12 +503,14 @@ def stackable(dim: Hashable) -> bool: var_names = list(roles_and_names.keys()) else: # for cell_measures and formula_terms values are var names - var_names = [lst[0] for lst in roles_and_names.values()] - # consistency check - # todo: test this + var_names = [ + e for lst in roles_and_names.values() for e in lst + ] + # consistency check (one element per key) if len(var_names) != len(roles_and_names.keys()): emit_user_level_warning( - f"Attribute {attr_name} malformed" + f"Attribute {attr_name!r} has malformed content [{attr_val!r}], " + f"decoding {var_names!r} to coordinates." ) if all(var_name in variables for var_name in var_names): new_vars[k].encoding[attr_name] = attr_val diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 7be6eb5ed0d..9b6daf5d9a0 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1138,6 +1138,9 @@ def test_coordinate_variables_after_dataset_roundtrip(self) -> None: original = self._create_cf_dataset() with self.roundtrip(original, open_kwargs={"decode_coords": "all"}) as actual: assert_identical(actual, original) + # test for fixing stray colon + assert original["ln_p"].encoding["formula_terms"] == "p0: P0 lev : ln_p" + assert actual["ln_p"].encoding["formula_terms"] == "p0: P0 lev: ln_p" with self.roundtrip(original) as actual: expected = original.reset_coords( @@ -1148,6 +1151,27 @@ def test_coordinate_variables_after_dataset_roundtrip(self) -> None: # identical would require resetting a number of attributes # skip that. assert_equal(actual, expected) + # test for not fixing stray colon + assert original["ln_p"].encoding["formula_terms"] == "p0: P0 lev : ln_p" + assert actual["ln_p"].attrs["formula_terms"] == "p0: P0 lev : ln_p" + + def test_coordinate_decoding_warnings_and_errors(self) -> None: + original = self._create_cf_dataset() + original.coords["ln_p"].encoding.update({"formula_terms": "p0 P0 lev: ln_p"}) + with pytest.raises(ValueError, match="misses ':'"): + with self.roundtrip( + original, open_kwargs={"decode_coords": "all"} + ) as actual: + assert isinstance(actual, xr.Dataset) + + original.coords["ln_p"].encoding.update( + {"formula_terms": "p0: P0 P1 lev: ln_p"} + ) + with pytest.warns(UserWarning, match="has malformed content"): + with self.roundtrip( + original, open_kwargs={"decode_coords": "all"} + ) as actual: + assert isinstance(actual, xr.Dataset) def test_grid_mapping_and_bounds_are_coordinates_after_dataarray_roundtrip( self, From dddb6bc15203ac02a53fdfa81885a36823ba4526 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 12 Nov 2024 08:48:09 +0100 Subject: [PATCH 04/14] add whats-new.rst entry --- doc/whats-new.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 4659978df8a..a38575e183d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -53,6 +53,8 @@ Bug fixes By `Stephan Hoyer `_. - Fix regression in the interoperability of :py:meth:`DataArray.polyfit` and :py:meth:`xr.polyval` for date-time coordinates. (:pull:`9691`). By `Pascal Bourgault `_. +- Fix CF decoding of ``grid_mapping`` to allow all possible formats, add tests (:issue:`9761`, :pull:`9765`). + By `Kai Mühlbauer `_. Documentation ~~~~~~~~~~~~~ From bcbe5f37245dfe4fee897eaacf8096804f548f7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 12 Nov 2024 09:23:35 +0100 Subject: [PATCH 05/14] check for second warning, copy to prevent windows error (?) --- xarray/tests/test_backends.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 9b6daf5d9a0..5783e11ea53 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1156,7 +1156,7 @@ def test_coordinate_variables_after_dataset_roundtrip(self) -> None: assert actual["ln_p"].attrs["formula_terms"] == "p0: P0 lev : ln_p" def test_coordinate_decoding_warnings_and_errors(self) -> None: - original = self._create_cf_dataset() + original = self._create_cf_dataset().copy() original.coords["ln_p"].encoding.update({"formula_terms": "p0 P0 lev: ln_p"}) with pytest.raises(ValueError, match="misses ':'"): with self.roundtrip( @@ -1168,10 +1168,11 @@ def test_coordinate_decoding_warnings_and_errors(self) -> None: {"formula_terms": "p0: P0 P1 lev: ln_p"} ) with pytest.warns(UserWarning, match="has malformed content"): - with self.roundtrip( - original, open_kwargs={"decode_coords": "all"} - ) as actual: - assert isinstance(actual, xr.Dataset) + with pytest.warns(UserWarning, match="not in variables:"): + with self.roundtrip( + original, open_kwargs={"decode_coords": "all"} + ) as actual: + assert isinstance(actual, xr.Dataset) def test_grid_mapping_and_bounds_are_coordinates_after_dataarray_roundtrip( self, From 44989a4bda6f6186746fb25b86446e3fed440604 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 12 Nov 2024 09:50:50 +0100 Subject: [PATCH 06/14] revert copy, but set allow_cleanup_failures=ON_WINDOWS --- xarray/tests/test_backends.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 5783e11ea53..786aad19927 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1156,11 +1156,13 @@ def test_coordinate_variables_after_dataset_roundtrip(self) -> None: assert actual["ln_p"].attrs["formula_terms"] == "p0: P0 lev : ln_p" def test_coordinate_decoding_warnings_and_errors(self) -> None: - original = self._create_cf_dataset().copy() + original = self._create_cf_dataset() original.coords["ln_p"].encoding.update({"formula_terms": "p0 P0 lev: ln_p"}) with pytest.raises(ValueError, match="misses ':'"): with self.roundtrip( - original, open_kwargs={"decode_coords": "all"} + original, + open_kwargs={"decode_coords": "all"}, + allow_cleanup_failure=ON_WINDOWS, ) as actual: assert isinstance(actual, xr.Dataset) From 88d3a77ab24c0f8f4ee9141377e86c1494716604 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 13 Nov 2024 17:53:23 +0100 Subject: [PATCH 07/14] add itertools --- xarray/conventions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/conventions.py b/xarray/conventions.py index 02b24a4ff50..fb8e8c90751 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -2,6 +2,7 @@ from collections import defaultdict from collections.abc import Hashable, Iterable, Mapping, MutableMapping +import itertools from typing import TYPE_CHECKING, Any, Literal, TypeVar, Union import numpy as np From 6e872c18d906db8be144ec059d1a000e7f289011 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 13 Nov 2024 16:53:40 +0000 Subject: [PATCH 08/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/conventions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index fb8e8c90751..02b24a4ff50 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -2,7 +2,6 @@ from collections import defaultdict from collections.abc import Hashable, Iterable, Mapping, MutableMapping -import itertools from typing import TYPE_CHECKING, Any, Literal, TypeVar, Union import numpy as np From 205d8e2857fb118fe7ba70df17720856d0562fa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 13 Nov 2024 17:53:54 +0100 Subject: [PATCH 09/14] Update xarray/conventions.py Co-authored-by: Deepak Cherian --- xarray/conventions.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 02b24a4ff50..f8295df193d 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -503,9 +503,7 @@ def stackable(dim: Hashable) -> bool: var_names = list(roles_and_names.keys()) else: # for cell_measures and formula_terms values are var names - var_names = [ - e for lst in roles_and_names.values() for e in lst - ] + var_names = list(itertools.chain(*roles_and_names.values())) # consistency check (one element per key) if len(var_names) != len(roles_and_names.keys()): emit_user_level_warning( From 47c60384ff9b331ce4946c39c637005b3d957323 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 13 Nov 2024 17:57:16 +0100 Subject: [PATCH 10/14] Update conventions.py --- xarray/conventions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/conventions.py b/xarray/conventions.py index f8295df193d..0624e7de1a6 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -2,6 +2,7 @@ from collections import defaultdict from collections.abc import Hashable, Iterable, Mapping, MutableMapping +import itertools from typing import TYPE_CHECKING, Any, Literal, TypeVar, Union import numpy as np From 7b8ed8e4f9f19e873284c7fa70e774c42d9cbe9e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 13 Nov 2024 16:58:26 +0000 Subject: [PATCH 11/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/conventions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 0624e7de1a6..94c7cff8f98 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -1,8 +1,8 @@ from __future__ import annotations +import itertools from collections import defaultdict from collections.abc import Hashable, Iterable, Mapping, MutableMapping -import itertools from typing import TYPE_CHECKING, Any, Literal, TypeVar, Union import numpy as np From be0abfa12d32df428930df5f2fcd37ce770bfb88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 13 Nov 2024 20:26:55 +0100 Subject: [PATCH 12/14] add test in test_conventions.py --- xarray/tests/test_conventions.py | 90 ++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index e6c69fc1ee1..39950b4f9b8 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -294,6 +294,96 @@ def test_decode_coordinates(self) -> None: actual = conventions.decode_cf(original) assert actual.foo.encoding["coordinates"] == "x" + def test_decode_coordinates_with_key_values(self) -> None: + # regression test for GH9761 + original = Dataset( + { + "temp": ( + ("y", "x"), + np.random.rand(2, 2), + { + "long_name": "temperature", + "units": "K", + "coordinates": "lat lon", + "grid_mapping": "crs", + }, + ), + "x": ( + ("x"), + np.arange(2), + {"standard_name": "projection_x_coordinate", "units": "m"}, + ), + "y": ( + ("y"), + np.arange(2), + {"standard_name": "projection_y_coordinate", "units": "m"}, + ), + "lat": ( + ("y", "x"), + np.random.rand(2, 2), + {"standard_name": "latitude", "units": "degrees_north"}, + ), + "lon": ( + ("y", "x"), + np.random.rand(2, 2), + {"standard_name": "longitude", "units": "degrees_east"}, + ), + "crs": ( + (), + None, + { + "grid_mapping_name": "transverse_mercator", + "longitude_of_central_meridian": -2.0, + }, + ), + "crs2": ( + (), + None, + { + "grid_mapping_name": "longitude_latitude", + "longitude_of_central_meridian": -2.0, + }, + ), + }, + ) + + original.temp.attrs["grid_mapping"] = "crs: x y" + vars, attrs, coords = conventions.decode_cf_variables( + original.variables, {}, decode_coords="all" + ) + assert coords == {"lat", "lon", "crs"} + + original.temp.attrs["grid_mapping"] = "crs: x y crs2: lat lon" + vars, attrs, coords = conventions.decode_cf_variables( + original.variables, {}, decode_coords="all" + ) + assert coords == {"lat", "lon", "crs", "crs2"} + + # stray colon + original.temp.attrs["grid_mapping"] = "crs: x y crs2 : lat lon" + vars, attrs, coords = conventions.decode_cf_variables( + original.variables, {}, decode_coords="all" + ) + assert coords == {"lat", "lon", "crs", "crs2"} + + original.temp.attrs["grid_mapping"] = "crs x y crs2: lat lon" + with pytest.raises(ValueError, match="misses ':'"): + conventions.decode_cf_variables(original.variables, {}, decode_coords="all") + + del original.temp.attrs["grid_mapping"] + original.temp.attrs["formula_terms"] = "A: lat D: lon E: crs2" + vars, attrs, coords = conventions.decode_cf_variables( + original.variables, {}, decode_coords="all" + ) + assert coords == {"lat", "lon", "crs2"} + + original.temp.attrs["formula_terms"] = "A: lat lon D: crs E: crs2" + with pytest.warns(UserWarning, match="has malformed content"): + vars, attrs, coords = conventions.decode_cf_variables( + original.variables, {}, decode_coords="all" + ) + assert coords == {"lat", "lon", "crs", "crs2"} + def test_0d_int32_encoding(self) -> None: original = Variable((), np.int32(0), encoding={"dtype": "int64"}) expected = Variable((), np.int64(0)) From b03c943c7638cc7091d7eff03594b585d393978a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 13 Nov 2024 20:30:52 +0100 Subject: [PATCH 13/14] add comment --- xarray/conventions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/conventions.py b/xarray/conventions.py index 94c7cff8f98..e4e71a481e8 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -487,6 +487,8 @@ def stackable(dim: Hashable) -> bool: and len(var_names) > 1 ): # map the keys to list of strings + # "A: b c d E: f g" returns + # {"A": ["b", "c", "d"], "E": ["f", "g"]} roles_and_names = defaultdict(list) key = None for vname in var_names: From d67b1c1b4d2ab1743a3f31e75f8a6fbeb107324e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 13 Nov 2024 23:05:59 +0100 Subject: [PATCH 14/14] revert backend tests --- xarray/tests/test_backends.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 786aad19927..7be6eb5ed0d 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1138,9 +1138,6 @@ def test_coordinate_variables_after_dataset_roundtrip(self) -> None: original = self._create_cf_dataset() with self.roundtrip(original, open_kwargs={"decode_coords": "all"}) as actual: assert_identical(actual, original) - # test for fixing stray colon - assert original["ln_p"].encoding["formula_terms"] == "p0: P0 lev : ln_p" - assert actual["ln_p"].encoding["formula_terms"] == "p0: P0 lev: ln_p" with self.roundtrip(original) as actual: expected = original.reset_coords( @@ -1151,30 +1148,6 @@ def test_coordinate_variables_after_dataset_roundtrip(self) -> None: # identical would require resetting a number of attributes # skip that. assert_equal(actual, expected) - # test for not fixing stray colon - assert original["ln_p"].encoding["formula_terms"] == "p0: P0 lev : ln_p" - assert actual["ln_p"].attrs["formula_terms"] == "p0: P0 lev : ln_p" - - def test_coordinate_decoding_warnings_and_errors(self) -> None: - original = self._create_cf_dataset() - original.coords["ln_p"].encoding.update({"formula_terms": "p0 P0 lev: ln_p"}) - with pytest.raises(ValueError, match="misses ':'"): - with self.roundtrip( - original, - open_kwargs={"decode_coords": "all"}, - allow_cleanup_failure=ON_WINDOWS, - ) as actual: - assert isinstance(actual, xr.Dataset) - - original.coords["ln_p"].encoding.update( - {"formula_terms": "p0: P0 P1 lev: ln_p"} - ) - with pytest.warns(UserWarning, match="has malformed content"): - with pytest.warns(UserWarning, match="not in variables:"): - with self.roundtrip( - original, open_kwargs={"decode_coords": "all"} - ) as actual: - assert isinstance(actual, xr.Dataset) def test_grid_mapping_and_bounds_are_coordinates_after_dataarray_roundtrip( self,