feat(models,training): multiple datasets (WIP) #441
Closed

Commits (187):
714506b  data-handlers wip (floriankrb)
310a5a0  data-handlers wip (JPXKQX)
4d36e16  add test (JPXKQX)
17a11eb  Merge branch 'main' into refactor/multiple-datasets (JPXKQX)
9624ba2  WIP EncProcDec dictionairy batch input (havardhhaugen)
cdde2cb  group tests into tests/refactor (JPXKQX)
015b417  First data handler version (JPXKQX)
42213b6  Merge branch 'main' into refactor/multiple-datasets (JPXKQX)
023fb21  WIP loss function / forecaster multiple datasets (havardhhaugen)
9a5c492  Fix training_step only wants batch as input (havardhhaugen)
9385e7f  clean dataloader (JPXKQX)
c4b708d  style (JPXKQX)
3bf9966  Merge branch 'refactor/multiple-datasets' of https://github.com/ecmwf… (JPXKQX)
404bb53  refactor (JPXKQX)
333a9a8  graph update (JPXKQX)
5d2f83f  wip (JPXKQX)
ce3cf12  update config with sample_providers (JPXKQX)
6bd02be  wip II (JPXKQX)
9e5a01e  data loading works ... (JPXKQX)
7198d93  Add Sampler to synchronise reference date (JPXKQX)
ea09afb  clean train/val/test stages (JPXKQX)
fb10ba6  Merge branch 'main' into refactor/multiple-datasets (JPXKQX)
2aee9bb  keep refactoring with Florian (JPXKQX)
d5ed404  works: train --config-name=debug_multiple_datasets (JPXKQX)
0df0ac0  missing changes (JPXKQX)
6176e1a  pre-commit (JPXKQX)
4c8a7b3  models: pre-commit (JPXKQX)
cb24aa4  add draft classes for Tensor/arrays (JPXKQX)
396bc70  things, stackedThings, and groupedthings (JPXKQX)
54bc4a3  Merge branch 'main' into feature/lat-weighted-attr (JPXKQX)
a0fe6d5  back to dict[tensors] (JPXKQX)
1d3bf4e  reduce num batches in debug config (VeraChristina)
a3a3eae  basic dict loss,works: train --config-name=debug_multiple_datasets (VeraChristina)
ada3c3b  update normalizer to use datahandler information (VeraChristina)
1d74b87  remove additional scalars from DictLoss (VeraChristina)
e3f0b0c  Merge branch 'main' into refactor/multiple-datasets (JPXKQX)
1e32df0  Merge branch 'refactor/multiple-datasets-dictloss' into refactor/mult… (VeraChristina)
80b75a5  works: anemoi-training train --config-name=debug_multiple_datasets (VeraChristina)
ecc02a4  remove non-selected variables from processors in SelectedDataHandler (VeraChristina)
a70d4b4  statistics property for RecordProvider (JPXKQX)
7c3bcc3  dict preprocessors (floriankrb)
6500a92  fix (floriankrb)
0549324  works: (floriankrb)
418600a  Merge branch 'main' into refactor/multiple-datasets (JPXKQX)
c3f45e3  provide base configs (JPXKQX)
8b3ed1e  fix: data handlers config (JPXKQX)
37cc932  minor fixes (JPXKQX)
a13e20f  update (JPXKQX)
a8c9765  temporary (JPXKQX)
bffd874  expand config (JPXKQX)
38d492e  using one type for DataHandlers. does not run. (floriankrb)
14834c0  renaming groups (JPXKQX)
cb566e0  add grouped and nongrouped data handlers (JPXKQX)
4cc2eb0  update data handlers (JPXKQX)
785ee38  small refactor (JPXKQX)
2dd4abc  fix config typo (JPXKQX)
ffd278d  style (JPXKQX)
cc786ab  move to timedelta (JPXKQX)
a46af98  minor fix (JPXKQX)
d551e1c  use __getitem__ notation (JPXKQX)
1991aa2  blank spaces (JPXKQX)
98a03ff  datetime as _getitem_ arg for sample and record providers (JPXKQX)
8c9dc91  index as int instead of np.int (JPXKQX)
cd67840  implentation idea for sample provider. tests do not pass (floriankrb)
e15eb86  up (floriankrb)
d964b61  draft (floriankrb)
c7cb41b  user-friendly config (JPXKQX)
8ee288b  mapping from user-friendly yaml to config dict (JPXKQX)
bd59aef  dop draft (floriankrb)
02625b8  hard-coded fix to avoid using data at [-1] (floriankrb)
829560e  up (floriankrb)
3d9cac6  up (floriankrb)
052e1e1  dop now importing from anemoi-training. (floriankrb)
b096124  dop script runnning (floriankrb)
63f40e7  dop script runnning (floriankrb)
168b4f0  Merge branch 'refactor/multiple-datasets-2' into refactor/multiple-da… (floriankrb)
6811306  simplify (floriankrb)
6b133d1  simplify (floriankrb)
17e6871  up (floriankrb)
d0f4497  add shuffle (floriankrb)
62622e3  renamed to SampleProvider (floriankrb)
9382d67  refactoring to new structure (JPXKQX)
3237b5f  how about we add in latitudes, longitudes and timedeltas as a dict (floriankrb)
988aee4  update the config (floriankrb)
0db7148  give seconds (int64) because pytorch does not like timedeltas (floriankrb)
65930b4  add "set_group" to data_handlers config (JPXKQX)
d9f20d4  test_data_loading works (JPXKQX)
ce85715  test_data_loading works (with era5 variable selection) (JPXKQX)
e20d443  update with sample_provider.latitude(i) (floriankrb)
9be4b06  implement sample_provider.latitude(i) (floriankrb)
fb23966  test_data_loading works (it returns dict[dict[list[arrays]]]) (JPXKQX)
52291dd  introducing processors (JPXKQX)
e3d3eb3  renamed "groups" into "dictionary" (floriankrb)
90e75ab  use utils to convert to frequency (floriankrb)
feae96a  remove non-sense every-thing-as-a-dict (floriankrb)
753ead9  stack tensors with the "tensor" keyword (floriankrb)
3f79330  Breaking change in the config. "tensor" keyword. changing config (floriankrb)
756f107  keywork STEPS disappears. adding "tuple" (floriankrb)
e2d4f86  update dop (floriankrb)
1b3ff34  processors working (JPXKQX)
7f29dfd  Merge branch 'refactor/multiple-datasets-3' of https://github.com/ecm… (JPXKQX)
77a4e40  add shapes (JPXKQX)
abb1c28  added length of a sample provider (floriankrb)
c463391  added "timdeltas" shortcut (floriankrb)
fd01710  clean up (floriankrb)
c5913e6  clean up (floriankrb)
47169ed  moving processors (floriankrb)
4f9c9a4  rename sample_factory to sample_provider_factory (floriankrb)
46064df  up (floriankrb)
e4c1f2b  shuffling samples (floriankrb)
033f2c8  fix: processors (JPXKQX)
f696a90  fix (floriankrb)
6440da5  clean (floriankrb)
4accfa3  type hints (JPXKQX)
7ab71a3  Merge branch 'refactor/multiple-datasets-3' of https://github.com/ecm… (JPXKQX)
8ec2720  include num_channels (JPXKQX)
45eb8b7  include downscaling config (JPXKQX)
d29d675  update (JPXKQX)
1a43e86  update num_channels (JPXKQX)
8e802e2  training (JPXKQX)
0c31f2d  more logs (floriankrb)
f2f4135  remove old code (floriankrb)
e6ae018  training sources of data (floriankrb)
9cea226  black (floriankrb)
ad85d24  update config (floriankrb)
360e684  renamed data_config into sources (floriankrb)
a7e7277  fix: configs & test (JPXKQX)
2ef2c5c  new configs: downscaling (JPXKQX)
0ec6b3d  new configs: downscaling (JPXKQX)
9af5df2  fixing workflow (JPXKQX)
41e0165  refactor with breaking changes in the config. (floriankrb)
7cd8029  feat: updating downscaling workflow (JPXKQX)
1fab855  freq (JPXKQX)
784027a  update configs (JPXKQX)
b883778  include graph (JPXKQX)
62ba9f0  clean (floriankrb)
1bae774  top level request gets priority (floriankrb)
f08f417  added configs and configs.* in request (floriankrb)
543cac7  qa (floriankrb)
eccfcbd  update models (JPXKQX)
16b4693  Merge branch 'refactor/multiple-datasets-3' of https://github.com/ecm… (JPXKQX)
0677d06  more logs (floriankrb)
aa5769f  more updates (JPXKQX)
c03493e  allow sub group in variables list (floriankrb)
805876a  minor (JPXKQX)
e9c869b  pre-commit (JPXKQX)
256fd7e  move name_to_index as an attribute of the sample provider. wip (floriankrb)
98f8f17  re-add name_to_index in request (floriankrb)
e6f338e  fix lenght (floriankrb)
d136f94  added statistics (floriankrb)
0288d6e  refactor and clean (floriankrb)
37a160c  up (floriankrb)
c397965  fix (floriankrb)
58ab546  bringing dynamic mappers (JPXKQX)
3063942  the sample provider provides now always a dict (floriankrb)
bfd7144  apply function (floriankrb)
6c26add  remove request from datamodule (JPXKQX)
ef6a57f  cleanup (floriankrb)
d8f73b0  added get_obj (floriankrb)
d956105  added get_native (floriankrb)
ac280c8  use structure (JPXKQX)
874c265  Merge branch 'refactor/multiple-datasets-3' of https://github.com/ecm… (JPXKQX)
83bbbdb  WIP (mishooax)
b57012f  Merge branch 'refactor/multiple-datasets-3' of github.com:ecmwf/anemo… (mishooax)
342cb37  add function structure (floriankrb)
0870c5e  clean (floriankrb)
77a7c64  inputer --> imputer (JPXKQX)
de43262  fix: factory (JPXKQX)
ef96dfa  works: downscaling (JPXKQX)
1356eb8  fix: dims in loss (JPXKQX)
db50a2c  add apply function as attriibute to Structure (JPXKQX)
53ebd3c  suppor regional models (JPXKQX)
9ddb758  Merge branch 'main' into refactor/multiple-datasets-4 (JPXKQX)
4379086  feat: working version (downscaling & multiple datasets) (JPXKQX)
5043048  pre-commit (JPXKQX)
8938c61  configs (JPXKQX)
411b17b  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
192c664  remove unused (JPXKQX)
7254f9b  style (JPXKQX)
c1d7237  Merge branch 'main' into refactor/multiple-datasets-4 (JPXKQX)
e1e7a26  feat: autoencoder configs (JPXKQX)
328f87a  Merge branch 'main' into refactor/multiple-datasets-4 (JPXKQX)
e343ba3  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
4e114f7  pre-commit (JPXKQX)
eebaf96  Update training/src/anemoi/training/data/refactor/datamodule.py (JPXKQX)
6d4953b  Merge branch 'refactor/multiple-datasets-4' of https://github.com/ecm… (JPXKQX)
60dffed  imports (JPXKQX)
New file (220 lines added):

```python
import json
import os
import random
from typing import Optional

import numpy as np
import torch
import yaml
from torch.utils.data import IterableDataset
from torch.utils.data import get_worker_info

from anemoi.training.data.refactor.draft import sample_provider_factory

# Draft configuration: three observation sources for training and a nested
# sample specification consumed by the sample provider factory.
CONFIG_YAML = """
sources:
  training:
    # era5:
    #   dataset:
    #     dataset: aifs-ea-an-oper-0001-mars-o96-1979-2023-6h-v8
    #   set_group: era5
    #   # preprocessors:
    #   #   tp:
    #   #     - normalizer: mean-std
    snow:
      dataset: observations-testing-2018-2018-6h-v1-one-month
    metop_a:
      dataset: observations-testing-2018-2018-6h-v1-one-month
    amsr2_h180:
      dataset: observations-testing-2018-2018-6h-v1-one-month
  validation:
    todo:
sample:
  dictionary:
    input:
      dictionary:
        ascat_metop_a:
          tuple:
            - timedelta: "-6h"
              variables:
                metop_a: ["scatss_1", "scatss_2"]
        snow:
          tuple:
            - timedelta: "0h"
              variables:
                snow: ["sdepth_0"]
        amsr2:
          tuple:
            - timedelta: "-6h"
              variables:
                amsr2_h180: ["rawbt_1", "rawbt_2", "rawbt_3", "rawbt_4"]
"""

CONFIG = yaml.safe_load(CONFIG_YAML)


def show_yaml(structure):
    return yaml.dump(structure, indent=2, sort_keys=False)


def show_json(structure):
    return json.dumps(structure, indent=2, default=shorten_numpy)


def shorten_numpy(structure):
    if isinstance(structure, np.ndarray):
        return f"np.array({structure.shape})"
    return structure


def get_base_seed():
    """Get a base seed for random number generation.

    This is a placeholder function; replace with actual logic to get a base seed.
    """
    return 42  # Example fixed seed, replace with actual logic as needed


class DOPDataset(IterableDataset):
    def __init__(
        self,
        # config: dict,
        shuffle: bool = True,
        rollout: int = 1,
        multistep: int = 1,
        task: str = "training",
    ) -> None:
        self.shuffle = shuffle
        # self.config = config
        self.rollout = rollout
        self.multistep = multistep
        self.task = task

        # lazy init
        self.n_samples_per_epoch_total: int = 0
        self.n_samples_per_epoch_per_worker: int = 0

        # additional state vars (lazy init)
        self.n_samples_per_worker = 0
        self.chunk_index_range: Optional[np.ndarray] = None
        self.rng: Optional[np.random.Generator] = None
        self.worker_id: int = -1

        # "full" shuffling
        self.data_indices: Optional[np.ndarray] = None

        self.seed_comm_group_id = 0
        self.seed_comm_num_groups = 1

        training_context = {
            "name": "training",
            "sources": CONFIG["sources"]["training"],
            "start": "2018-11-02",
            "end": "2018-11-01",
        }

        self._sample_provider = sample_provider_factory(context=training_context, **CONFIG["sample"])
        self._sample_provider = self._sample_provider.shuffle(seed=42)

        # self.len = len(self._sample_provider)

    def __get_sample(self, index: int):
        """Get a sample from the dataset."""
        return self._sample_provider[index]

    def per_worker_init(self, n_workers: int, worker_id: int) -> None:
        """Called by worker_init_func on each copy of the dataset.

        This initialises after the worker process has been spawned.

        Parameters
        ----------
        n_workers : int
            Number of workers
        worker_id : int
            Worker ID
        """
        self.worker_id = worker_id

        length = len(self._sample_provider)
        # Divide this equally across shards (one shard per group!)
        shard_size = length // self.seed_comm_num_groups
        shard_start = self.seed_comm_group_id * shard_size
        shard_end = min((self.seed_comm_group_id + 1) * shard_size, length)

        shard_len = shard_end - shard_start
        self.n_samples_per_worker = shard_len // n_workers

        low = shard_start + worker_id * self.n_samples_per_worker
        high = min(shard_start + (worker_id + 1) * self.n_samples_per_worker, shard_end)
        self.chunk_index_range = np.arange(low, high, dtype=np.uint32)

        seed = get_base_seed()  # all workers get the same seed (so they all get the same index shuffle)
        torch.manual_seed(seed)
        random.seed(seed)
        self.rng = np.random.default_rng(seed=seed)
        sanity_rnd = self.rng.random(1)
        print("Sanity check random number:", sanity_rnd)

    def __iter__(self):
        # no shuffle, just iterate over the chunk indices
        for idx in self.chunk_index_range:
            print(
                f"{self.task.upper()}: Worker {self.worker_id} (pid {os.getpid()}) fetching sample index {idx} ...",
            )
            yield self.__get_sample(idx)


def worker_init_func(worker_id: int) -> None:
    """Configures each dataset worker process.

    Calls DOPDataset.per_worker_init() on each dataset object.

    Parameters
    ----------
    worker_id : int
        Worker ID

    Raises
    ------
    RuntimeError
        If worker_info is None
    """
    worker_info = get_worker_info()  # information specific to each worker process
    if worker_info is None:
        print("worker_info is None! Set num_workers > 0 in your dataloader!")
        raise RuntimeError
    dataset_obj = worker_info.dataset  # the copy of the dataset held by this worker process
    dataset_obj.per_worker_init(
        n_workers=worker_info.num_workers,
        worker_id=worker_id,
    )


if __name__ == "__main__":
    ds = DOPDataset(
        # CONFIG,
        shuffle=False,
        rollout=1,
        multistep=1,
        task="training",
    )

    loader_params = {
        "batch_size": 1,  # must be 1 for the time being
        "batch_sampler": None,
        "num_workers": 2,
        "pin_memory": False,
        "worker_init_fn": worker_init_func,
        # "collate_fn": None,  # collator_wrapper(return_original_metadata=cfg_.dataloader.return_dates),
    }

    dl = torch.utils.data.DataLoader(ds, **loader_params, sampler=None)

    for batch_idx, batch in enumerate(dl):
        print(batch)
        if batch_idx >= 1:
            break
```
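The sharding in `per_worker_init` above first splits the sample index space into one shard per communication group, then splits each shard across dataloader workers. Below is a minimal standalone sketch of that arithmetic, not part of the PR; the helper name `shard_indices` and the toy sizes (10 samples, one group, two workers) are ours, introduced only for illustration:

```python
import numpy as np


def shard_indices(length: int, num_groups: int, group_id: int, n_workers: int, worker_id: int) -> np.ndarray:
    """Mirror the index-range computation in DOPDataset.per_worker_init."""
    shard_size = length // num_groups  # samples per communication group
    shard_start = group_id * shard_size
    shard_end = min((group_id + 1) * shard_size, length)
    n_samples_per_worker = (shard_end - shard_start) // n_workers
    low = shard_start + worker_id * n_samples_per_worker
    high = min(shard_start + (worker_id + 1) * n_samples_per_worker, shard_end)
    return np.arange(low, high, dtype=np.uint32)


# 10 samples, 1 communication group, 2 workers:
print(shard_indices(10, 1, 0, 2, 0))  # -> [0 1 2 3 4]
print(shard_indices(10, 1, 0, 2, 1))  # -> [5 6 7 8 9]
```

Each worker ends up with a disjoint, contiguous index range; any remainder left by the two integer divisions is silently dropped.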