aeon-toolkit · TonyBagnall · Oct 30, 2023 · Sep 5, 2023 · Sep 5, 2023 · Sep 26, 2023
diff --git a/aeon/registry/_base_classes.py b/aeon/registry/_base_classes.py
@@ -43,6 +43,7 @@
 from aeon.networks.base import BaseDeepNetwork
 from aeon.performance_metrics.base import BaseMetric
 from aeon.regression.base import BaseRegressor
+from aeon.similarity_search.base import BaseSimiliaritySearch
 from aeon.transformations.base import BaseTransformer
 from aeon.transformations.collection import BaseCollectionTransformer
 
@@ -64,6 +65,7 @@
         BaseCollectionTransformer,
         "time series collection transformer",
     ),
+    ("similarity-search", BaseSimiliaritySearch, "similarity search"),
 ]
 
 

diff --git a/aeon/registry/tests/test_lookup.py b/aeon/registry/tests/test_lookup.py
@@ -22,6 +22,7 @@
     "network",
     "collection-transformer",
     "collection-estimator",
+    "similarity-search",
 ]
 
 # shorthands for easy reading

@@ -0,0 +1,7 @@
+# -*- coding: utf-8 -*-
+"""Similarity search module."""
+
+__author__ = ["baraline"]
+__all__ = ["TopKSimilaritySearch"]
+
+from aeon.similarity_search.top_k_similarity import TopKSimilaritySearch
@@ -0,0 +1,164 @@
+# -*- coding: utf-8 -*-
+"""Base class for similarity search estimators."""
+
+__author__ = ["baraline"]
+
+from abc import ABC, abstractmethod
+
+import numpy as np
+
+from aeon.base import BaseEstimator
+from aeon.similarity_search.distance_profiles import (
+    naive_euclidean_profile,
+    normalized_naive_euclidean_profile,
+)
+from aeon.utils.numba.general import sliding_mean_std_one_series
+
+
+class BaseSimiliaritySearch(BaseEstimator, ABC):
+    """Base class for similarity search estimators.
+
+    ``fit`` stores a 3D collection of time series as the search database and
+    resolves the distance profile function. ``predict`` validates a 2D query,
+    optionally computes the statistics needed for z-normalization, and
+    delegates the search itself to the ``_predict`` method of the subclass.
+
+    Parameters
+    ----------
+    distance : str, default="euclidean"
+        Name of the distance function to use. Must be a key of
+        ``DISTANCE_PROFILE_DICT``.
+    normalize : bool, default=False
+        Whether the distance function should be z-normalized.
+    store_distance_profile : bool, default=False
+        Whether to store the computed distance profile in the attribute
+        "_distance_profile" after calling the predict method.
+
+    Notes
+    -----
+    The class name keeps the spelling "Similiarity" because it is imported
+    under that name elsewhere; renaming it would break those imports.
+    """
+
+    _tags = {
+        "capability:multivariate": True,
+        "capability:missing_values": False,
+    }
+
+    def __init__(
+        self, distance="euclidean", normalize=False, store_distance_profile=False
+    ):
+        self.distance = distance
+        self.normalize = normalize
+        self.store_distance_profile = store_distance_profile
+        super().__init__()
+
+    def _get_distance_profile_function(self):
+        """Return the distance profile function for the current parameters.
+
+        Returns
+        -------
+        callable
+            Entry of ``DISTANCE_PROFILE_DICT`` selected by ``self.distance``
+            and ``self.normalize``.
+
+        Raises
+        ------
+        ValueError
+            If ``self.distance`` is not a key of ``DISTANCE_PROFILE_DICT``.
+        """
+        dist_profile = DISTANCE_PROFILE_DICT.get(self.distance)
+        if dist_profile is None:
+            # Report the requested name: dist_profile is always None here.
+            raise ValueError(
+                f"Unknown distance profile function {self.distance}. "
+                f"Available distances are {list(DISTANCE_PROFILE_DICT)}."
+            )
+        return dist_profile[self.normalize]
+
+    def _store_mean_std_from_inputs(self, Q_length):
+        """Store sliding means and stds of every series of ``_X``.
+
+        The results are written to ``_X_means`` and ``_X_stds``, both of shape
+        (n_samples, n_channels, X_length - Q_length + 1).
+
+        Parameters
+        ----------
+        Q_length : int
+            Length of the query, used as the sliding window size.
+        """
+        n_samples, n_channels, X_length = self._X.shape
+        # Number of candidate windows inside ONE series. It must not be
+        # multiplied by n_samples: each row of the buffers below receives the
+        # statistics of a single series.
+        search_space_size = X_length - Q_length + 1
+
+        means = np.zeros((n_samples, n_channels, search_space_size))
+        stds = np.zeros((n_samples, n_channels, search_space_size))
+
+        for i, series in enumerate(self._X):
+            # NOTE(review): assumes the helper returns two arrays of shape
+            # (n_channels, search_space_size) -- confirm against its docs.
+            _mean, _std = sliding_mean_std_one_series(series, Q_length, 1)
+            means[i] = _mean
+            stds[i] = _std
+
+        self._X_means = means
+        self._X_stds = stds
+
+    def fit(self, X, y=None):
+        """
+        Fit method: store the input data and get the distance profile function.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples, n_channels, n_timestamps)
+            Input array to be used as database for the similarity search.
+        y : optional
+            Not used.
+
+        Raises
+        ------
+        TypeError
+            If the input X is not a 3D numpy array.
+        ValueError
+            If ``self.distance`` is not a known distance profile function.
+
+        Returns
+        -------
+        self
+            The fitted estimator.
+
+        """
+        # For now force (n_samples, n_channels, n_timestamps), we could convert 2D
+        #  (n_channels, n_timestamps) to 3D with a warning
+        if not isinstance(X, np.ndarray) or X.ndim != 3:
+            raise TypeError(
+                "Error, only supports 3D numpy of shape"
+                "(n_samples, n_channels, n_timestamps)."
+            )
+
+        # Get distance function
+        self.distance_profile_function = self._get_distance_profile_function()
+
+        self._X = X
+        self._fit(X, y)
+        return self
+
+    def predict(self, Q):
+        """
+        Predict method: Check the shape of Q and call _predict to perform the search.
+
+        If the distance profile function is normalized, it stores the mean and stds
+        from Q and _X.
+
+        Parameters
+        ----------
+        Q :  array, shape (n_channels, q_length)
+            Input query used for similarity search.
+
+        Raises
+        ------
+        TypeError
+            If the input Q is not a 2D numpy array, or if Q is not strictly
+            shorter than the series stored in _X.
+
+        Returns
+        -------
+        array
+            An array containing the indexes of the matches between Q and _X.
+            The decision of whether a candidate of size q_length from _X is
+            matched with Q depends on the subclasses that implement the
+            _predict method (e.g. top-k, threshold, ...).
+
+        """
+        if not isinstance(Q, np.ndarray) or Q.ndim != 2:
+            raise TypeError(
+                "Error, only supports 2D numpy atm. If Q is univariate"
+                " do Q.reshape(1,-1)."
+            )
+
+        if Q.shape[-1] >= self._X.shape[-1]:
+            raise TypeError("Error, Q must be shorter than X.")
+
+        if self.normalize:
+            # Per-channel statistics of the query, plus sliding statistics of
+            # every candidate window of the database.
+            self._Q_mean = np.mean(Q, axis=-1)
+            self._Q_std = np.std(Q, axis=-1)
+            self._store_mean_std_from_inputs(Q.shape[-1])
+
+        return self._predict(Q)
+
+    @abstractmethod
+    def _fit(self, X, y):
+        """Subclass hook called by ``fit`` once ``_X`` has been stored."""
+        ...
+
+    @abstractmethod
+    def _predict(self, X):
+        """Subclass hook called by ``predict`` with the validated query."""
+        ...
+
+
+"""
+Dictionary structure :
+    1st lvl key : distance function used
+        2nd lvl key : boolean indicating whether distance is normalized
+"""
+# Maps a distance name to its profile functions, which are in turn selected by
+# the boolean "normalize" flag.
+DISTANCE_PROFILE_DICT = dict(
+    euclidean={
+        True: normalized_naive_euclidean_profile,
+        False: naive_euclidean_profile,
+    },
+)
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+"""Distance profiles."""
+
+__author__ = ["baraline"]
+__all__ = ["naive_euclidean_profile", "normalized_naive_euclidean_profile"]
+
+from aeon.similarity_search.distance_profiles.naive_euclidean import (
+    naive_euclidean_profile,
+)
+from aeon.similarity_search.distance_profiles.normalized_naive_euclidean import (
+    normalized_naive_euclidean_profile,
+)
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+"""Helper and common function for similarity search distance profiles."""
+
+
+from numba import njit
+
+# Threshold on standard deviations for similarity search. Not referenced in the
+# visible code -- presumably guards against (near-)constant subsequences when
+# z-normalizing; TODO confirm against its users.
+AEON_SIMSEARCH_STD_THRESHOLD = 1e-7
+# Large finite stand-in for infinity; the naive Euclidean distance profile uses
+# it as the initial fill value of its output array.
+INF = 1e12
+
+
+@njit(cache=True)
+def _get_input_sizes(X, Q):
+    """
+    Return the dimensions of the inputs and the per-sample search space size.
+
+    Parameters
+    ----------
+    X : array, shape (n_samples, n_channels, series_length)
+         The input samples.
+    Q : array, shape (n_channels, query_length)
+        The input query.
+
+    Returns
+    -------
+    n_samples : int
+        Number of samples in X.
+    n_channels : int
+        Number of channels in X.
+    X_length : int
+        Number of timestamps in X.
+    q_length : int
+        Number of timestamps in Q.
+    search_space_size : int
+        Number of candidate subsequences of length q_length in each sample
+        of X, i.e. X_length - q_length + 1.
+
+    """
+    n_samples, n_channels, X_length = X.shape
+    query_len = Q.shape[-1]
+    return (n_samples, n_channels, X_length, query_len, X_length - query_len + 1)
+
+
+@njit(fastmath=True, cache=True)
+def _z_normalize_2D_series_with_mean_std(X, mean, std, copy=True):
+    """
+    Z-normalize each channel of a 2D series with precomputed statistics.
+
+    Parameters
+    ----------
+    X : array, shape = (n_channels, n_timestamps)
+        Input array to normalize.
+    mean : array, shape = (n_channels)
+        Mean of each channel.
+    std : array, shape = (n_channels)
+        Std of each channel.
+    copy : bool, default=True
+        If True, a copy of X is normalized and X is left untouched. If False,
+        X is overwritten in place.
+
+    Returns
+    -------
+    X : array, shape = (n_channels, n_timestamps)
+        The normalized array.
+    """
+    if copy:
+        X = X.copy()
+    n_channels = X.shape[0]
+    for channel in range(n_channels):
+        X[channel] = (X[channel] - mean[channel]) / std[channel]
+    return X
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+"""Naive Euclidean distance profile."""
+
+__author__ = ["baraline"]
+
+import numpy as np
+from numba import njit
+
+from aeon.distances import euclidean_distance
+from aeon.similarity_search.distance_profiles._commons import INF, _get_input_sizes
+
+
+def naive_euclidean_profile(X, Q):
+    """
+    Compute the Euclidean distance profile of a query by brute force.
+
+    Every candidate subsequence of every series of X is compared with the query
+    using the Euclidean distance. No optimization is applied, so this can be
+    slow.
+
+    For a (univariate) time series X_i = {x_1, ..., x_m} and a query
+    Q = {q_1, ..., q_l}, the distance profile is the vector of size m - l + 1
+    P(X_i, Q) = {d(C_1, Q), ..., d(C_{m-l+1}, Q)}, with d the Euclidean distance
+    and C_j = {x_j, ..., x_{j+l-1}} the j-th candidate subsequence of size l
+    in X_i.
+
+    Parameters
+    ----------
+    X: array shape (n_instances, n_channels, series_length)
+        The input samples.
+
+    Q : np.ndarray shape (n_channels, query_length)
+        The query used for similarity search.
+
+    Returns
+    -------
+    distance_profile : np.ndarray shape (n_instances, series_length - query_length + 1)
+        The distance profile between Q and the input time series X.
+
+    """
+    distance_profile = _naive_euclidean_profile(X, Q)
+    return distance_profile
+
+
+@njit(cache=True, fastmath=True)
+def _naive_euclidean_profile(X, Q):
+    """Compiled kernel of ``naive_euclidean_profile``; same inputs and output."""
+    # Only the sample count, query length and candidate count are needed here.
+    n_samples, _, _, Q_length, search_space_size = _get_input_sizes(X, Q)
+    profile = np.full((n_samples, search_space_size), INF)
+
+    # Slide a window of the query length over every sample and compare it with Q.
+    for i_sample in range(n_samples):
+        series = X[i_sample]
+        for start in range(search_space_size):
+            window = series[:, start : start + Q_length]
+            profile[i_sample, start] = euclidean_distance(Q, window)
+
+    return profile