add decorator for custom cache

aditya0by0 · aditya0by0 · commit ed28dfed9f30 · 2025-08-01T21:35:28.000+02:00
diff --git a/chebifier/ensemble/_custom_cache.py b/chebifier/ensemble/_custom_cache.py
@@ -2,7 +2,9 @@
 import pickle
 import threading
 from collections import OrderedDict
-from typing import Any
+from collections.abc import Iterable
+from functools import wraps
+from typing import Any, Callable
 
 
 class PerSmilesPerModelLRUCache:
@@ -30,6 +32,7 @@ def get(self, smiles: str, model_name: str) -> Any | None:
                 return None
 
     def set(self, smiles: str, model_name: str, value: Any) -> None:
+        assert value is not None, "Value must not be None"
         key = (smiles, model_name)
         with self._lock:
             if key in self._cache:
@@ -50,9 +53,65 @@ def clear(self) -> None:
     def stats(self) -> dict:
         return {"hits": self.hits, "misses": self.misses}
 
+    def batch_decorator(self, func: Callable) -> Callable:
+        """Wrap a batch-prediction method with per-(SMILES, model) caching.
+
+        The decorated method is called with a ``list`` of SMILES strings.
+        Entries already cached under the instance's ``model_name`` are served
+        from this cache; the remaining SMILES are passed to ``func`` in a
+        single call, as a ``tuple`` in their original relative order.
+        Predictions that are ``None`` are returned to the caller but are not
+        stored, so they are requested again on the next call.
+
+        Args:
+            func: Method invoked as ``func(instance, smiles_tuple)``. It must
+                return an iterable with exactly one prediction per SMILES it
+                was given, in the same order.
+
+        Returns:
+            The wrapped method. It returns a list of predictions aligned
+            index-by-index with the ``smiles_list`` it was called with.
+
+        Raises:
+            AssertionError: Raised by the wrapper if ``smiles_list`` is not a
+                list, the instance has no ``model_name``, or ``func`` does not
+                return an iterable.
+            ValueError: Raised by the wrapper if ``func`` returns a different
+                number of predictions than the number of SMILES it was given.
+        """
+
+        @wraps(func)
+        def wrapper(instance, smiles_list: list[str]):
+            assert isinstance(smiles_list, list), "smiles_list must be a list."
+            model_name = getattr(instance, "model_name", None)
+            assert model_name is not None, "Instance must have a model_name attribute."
+
+            # One slot per input keeps the output aligned with smiles_list
+            # without having to sort (index, result) pairs afterwards.
+            ordered = [None] * len(smiles_list)
+            missing_smiles = []
+            missing_indices = []
+
+            # First: try to fetch all from cache
+            for i, smiles in enumerate(smiles_list):
+                cached = self.get(smiles=smiles, model_name=model_name)
+                if cached is not None:
+                    ordered[i] = cached
+                else:
+                    missing_smiles.append(smiles)
+                    missing_indices.append(i)
+
+            # If some are missing, call original function once for all of them
+            if missing_smiles:
+                new_results = func(instance, tuple(missing_smiles))
+                assert isinstance(
+                    new_results, Iterable
+                ), "Function must return an  Iterable."
+                # Materialize so the count can be checked; zip() alone would
+                # silently drop the inputs that got no prediction.
+                new_results = list(new_results)
+                if len(new_results) != len(missing_smiles):
+                    raise ValueError(
+                        f"Function returned {len(new_results)} predictions "
+                        f"for {len(missing_smiles)} SMILES."
+                    )
+                for i, smiles, prediction in zip(
+                    missing_indices, missing_smiles, new_results
+                ):
+                    # set() rejects None, so failed predictions stay uncached
+                    if prediction is not None:
+                        self.set(smiles, model_name, prediction)
+                    ordered[i] = prediction
+
+            return ordered
+
+        return wrapper
+
+    def __len__(self) -> int:
+        """Return the number of (smiles, model_name) entries in the cache."""
+        # Read under the same lock that set() holds while modifying the cache.
+        with self._lock:
+            return len(self._cache)
+
+    def __repr__(self) -> str:
+        """Return the repr of the underlying cache mapping."""
+        return repr(self._cache)
+
+    def save(self) -> None:
+        """Write the cache to disk by delegating to ``_save_cache``."""
+        self._save_cache()
+
+    def load(self) -> None:
+        """Delegate to ``_load_cache``.
+
+        NOTE(review): presumably restores the cache from the persist path;
+        confirm against ``_load_cache``.
+        """
+        self._load_cache()
+
     def _save_cache(self) -> None:
         """Serialize the cache to disk."""
-        if not self._persist_path:
+        if self._persist_path:
             try:
                 with open(self._persist_path, "wb") as f:
                     pickle.dump(self._cache, f)
@@ -72,5 +131,40 @@ def _load_cache(self) -> None:
 
 
 if __name__ == "__main__":
-    # Example usage
-    cache = PerSmilesPerModelLRUCache(max_size=100, persist_path="cache.pkl")
+    # Demo of the batch decorator. No persist_path is passed here, so the cache
+    # is presumably in-memory only; pass persist_path="cache.pkl" to persist
+    # across runs (TODO confirm the constructor default).
+    cache = PerSmilesPerModelLRUCache(max_size=50)
+
+    class ExamplePredictor:
+        """Toy predictor whose ``predict`` is cached per (SMILES, model_name)."""
+
+        # Read by the decorator's wrapper to build the cache key.
+        model_name = "example_model"
+
+        # Callers pass a list; the decorator forwards only the uncached SMILES
+        # to this method, as a tuple.
+        @cache.batch_decorator
+        def predict(self, smiles_list: tuple[str, ...]) -> list[dict]:
+            # Simulate a prediction function (str hashes are salted per process
+            # by default, so the numbers differ between runs)
+            return [{"prediction": hash(smiles) % 100} for smiles in smiles_list]
+
+    # Create an instance of the predictor
+    predictor = ExamplePredictor()
+
+    # Prediction set 1 — new model, all should be cache misses
+    predictor.model_name = "example_model"
+    predictor.predict(["CCC", "C", "CCO", "CCN"])  # MISS × 4
+    print("Cache Stats:", cache.stats())
+
+    # Prediction set 2 — same model, partial hit/miss
+    predictor.model_name = "example_model"
+    predictor.predict(["CCC", "CO", "CCO", "CN"])  # HIT: CCC, CCO — MISS: CO, CN
+    print("Cache Stats:", cache.stats())
+
+    # Prediction set 3 — new model, same SMILES — should all be misses (per-model caching)
+    predictor.model_name = "example_model_2"
+    predictor.predict(["CCC", "C", "CO", "CN"])  # MISS × 4 (new model)
+    print("Cache Stats:", cache.stats())
+
+    # Prediction set 4 — another model
+    predictor.model_name = "example_model_3"
+    predictor.predict(["CCCC", "CCCl", "CCBr", "C(C)C"])  # MISS × 4
+    print("Cache Stats:", cache.stats())
+
+    # pprint falls back to the cache's __repr__, i.e. the underlying mapping
+    from pprint import pprint
+
+    pprint(cache)