Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
ba32f9d
base similarity search
TonyBagnall Sep 5, 2023
1ad9eca
slow search example
TonyBagnall Sep 5, 2023
e798d68
Merge branch 'main' of https://github.com/aeon-toolkit/aeon into expe…
MatthewMiddlehurst Sep 26, 2023
f5fc6b7
[ENH] Similarity search base class and TopK search with naïve Euclide…
baraline Sep 26, 2023
b6491be
Merge branch 'main' into experimental/similarity_search
TonyBagnall Sep 26, 2023
5ed662f
Merge branch 'experimental/similarity_search' of https://github.com/a…
TonyBagnall Sep 26, 2023
015dc9f
Merge branch 'main' into experimental/similarity_search
TonyBagnall Sep 26, 2023
c258de7
format
TonyBagnall Sep 26, 2023
1141032
add init
TonyBagnall Sep 26, 2023
ac04c74
Merge branch 'main' into experimental/similarity_search
TonyBagnall Sep 26, 2023
ae4b49a
call constructor
TonyBagnall Sep 26, 2023
98f95db
Merge branch 'main' into experimental/similarity_search
TonyBagnall Oct 2, 2023
a64e29b
add similarity base to register
TonyBagnall Oct 2, 2023
1a1858b
add similarity-search to tagging
TonyBagnall Oct 5, 2023
33557bc
Bugfixes for constant case and input alteration during normalization
baraline Oct 5, 2023
ba70e87
Merge branch 'main' into experimental/similarity_search
TonyBagnall Oct 7, 2023
cf72421
typo
TonyBagnall Oct 7, 2023
fbda755
[pre-commit.ci lite] apply automatic fixes
pre-commit-ci-lite[bot] Oct 7, 2023
f440c3e
typo
TonyBagnall Oct 7, 2023
2ea98a6
Merge branch 'main' into experimental/similarity_search
TonyBagnall Oct 7, 2023
1f74035
Merge branch 'experimental/similarity_search' of https://github.com/a…
TonyBagnall Oct 7, 2023
9fcd4d3
docstrings
TonyBagnall Oct 7, 2023
55ebc86
docstrings
TonyBagnall Oct 7, 2023
bc75368
docstrings
TonyBagnall Oct 7, 2023
7398eac
docstrings
TonyBagnall Oct 7, 2023
c7d927f
docstrings
TonyBagnall Oct 7, 2023
c5b4a33
Fixing typos
baraline Oct 8, 2023
6923f5d
Merge branch 'experimental/similarity_search' of https://github.com/a…
baraline Oct 8, 2023
bc51503
Adding some docs, adding base class arguments to topk, more expressiv…
baraline Oct 8, 2023
6bbe528
Change notation of query from Q to q
baraline Oct 8, 2023
7b40df9
Merge branch 'main' into experimental/similarity_search
TonyBagnall Oct 13, 2023
9fe08b5
Merge branch 'main' into experimental/similarity_search
TonyBagnall Oct 16, 2023
ed746d5
Adding example notebook and module img, updating docs and correcting …
baraline Oct 16, 2023
c11fd32
Merge branch 'experimental/similarity_search' of https://github.com/a…
baraline Oct 16, 2023
810de65
Adding parameters for self matches, typos in example notebook
baraline Oct 20, 2023
23f29cf
typo in import, replace Q with q
baraline Oct 20, 2023
787fe10
switch test example for pipeline
TonyBagnall Oct 21, 2023
174fff5
switch test example for pipeline
TonyBagnall Oct 21, 2023
10f79f2
Merge branch 'main' of https://github.com/aeon-toolkit/aeon
TonyBagnall Oct 21, 2023
2c66919
Add mask to distance profile, move exclusion zoneto base class, some …
baraline Oct 21, 2023
a63c21c
Merge branch 'main' into experimental/similarity_search
TonyBagnall Oct 22, 2023
b310735
Add distance profile and speedups notebooks, exclusion factor value c…
baraline Oct 22, 2023
e4d4b3e
Merge branch 'main' of https://github.com/aeon-toolkit/aeon
TonyBagnall Oct 22, 2023
bd75ab3
Merge branch 'main' of https://github.com/aeon-toolkit/aeon
TonyBagnall Oct 22, 2023
dc4d82c
Merge branch 'main' of https://github.com/aeon-toolkit/aeon
TonyBagnall Oct 23, 2023
8d4a3bd
Merge branch 'main' into experimental/similarity_search
baraline Oct 23, 2023
e0c82fd
Fixing tests and docs that where not updated after previous changes
baraline Oct 23, 2023
a625f9f
Force float convertion of input to avoid issues with normalization of…
baraline Oct 23, 2023
610ac00
Merge branch 'main' into experimental/similarity_search
TonyBagnall Oct 24, 2023
60027c5
Merge branch 'main' of https://github.com/aeon-toolkit/aeon
TonyBagnall Oct 25, 2023
b0a38ba
Merge branch 'main' into experimental/similarity_search
TonyBagnall Oct 25, 2023
b6d322f
Adding dummy class and test, correcting some docstrings
baraline Oct 27, 2023
6771ead
Fixes from Matthew review
baraline Oct 27, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions aeon/registry/_base_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from aeon.networks.base import BaseDeepNetwork
from aeon.performance_metrics.base import BaseMetric
from aeon.regression.base import BaseRegressor
from aeon.similarity_search.base import BaseSimiliaritySearch
from aeon.transformations.base import BaseTransformer
from aeon.transformations.collection import BaseCollectionTransformer

Expand All @@ -64,6 +65,7 @@
BaseCollectionTransformer,
"time series collection transformer",
),
("similarity-search", BaseSimiliaritySearch, "similarity search"),
]


Expand Down
1 change: 1 addition & 0 deletions aeon/registry/tests/test_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"network",
"collection-transformer",
"collection-estimator",
"similarity-search",
]

# shorthands for easy reading
Expand Down
7 changes: 7 additions & 0 deletions aeon/similarity_search/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# -*- coding: utf-8 -*-
"""Similarity search module."""

__author__ = ["baraline"]
__all__ = ["TopKSimilaritySearch"]

from aeon.similarity_search.top_k_similarity import TopKSimilaritySearch
164 changes: 164 additions & 0 deletions aeon/similarity_search/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# -*- coding: utf-8 -*-
"""BaseSimilaritySearch."""

__author__ = ["baraline"]

from abc import ABC, abstractmethod

import numpy as np

from aeon.base import BaseEstimator
from aeon.similarity_search.distance_profiles import (
naive_euclidean_profile,
normalized_naive_euclidean_profile,
)
from aeon.utils.numba.general import sliding_mean_std_one_series


class BaseSimiliaritySearch(BaseEstimator, ABC):
    """Base class for similarity search estimators.

    Subclasses implement ``_fit`` and ``_predict``; this class validates the
    inputs, resolves the distance profile function and, when normalization is
    requested, precomputes the means and stds needed by normalized profiles.

    Parameters
    ----------
    distance : str, default="euclidean"
        Name of the distance function to use.
    normalize : bool, default=False
        Whether the distance function should be z-normalized.
    store_distance_profile : bool, default=False
        Whether to store the computed distance profile in the attribute
        "_distance_profile" after calling the predict method.
    """

    _tags = {
        "capability:multivariate": True,
        "capability:missing_values": False,
    }

    def __init__(
        self, distance="euclidean", normalize=False, store_distance_profile=False
    ):
        self.distance = distance
        self.normalize = normalize
        self.store_distance_profile = store_distance_profile
        super().__init__()

    def _get_distance_profile_function(self):
        """Return the distance profile function matching the parameters.

        Returns
        -------
        callable
            Entry of DISTANCE_PROFILE_DICT for ``self.distance`` and
            ``self.normalize``.

        Raises
        ------
        ValueError
            If ``self.distance`` is not a key of DISTANCE_PROFILE_DICT.
        """
        dist_profile = DISTANCE_PROFILE_DICT.get(self.distance)
        if dist_profile is None:
            # Report the requested name; dist_profile is always None here.
            raise ValueError(
                f"Unknown distance profile function {self.distance}"
            )
        # bool() keeps the lookup consistent with the truthiness test in predict.
        return dist_profile[bool(self.normalize)]

    def _store_mean_std_from_inputs(self, Q_length):
        """Store the sliding means and stds of every series in ``self._X``.

        The results are stored in ``self._X_means`` and ``self._X_stds``, both of
        shape (n_samples, n_channels, X_length - Q_length + 1).

        Parameters
        ----------
        Q_length : int
            Length of the query, used as the sliding window length.
        """
        n_samples, n_channels, X_length = self._X.shape
        # Number of candidate subsequences of length Q_length in ONE series.
        search_space_size = X_length - Q_length + 1

        means = np.zeros((n_samples, n_channels, search_space_size))
        stds = np.zeros((n_samples, n_channels, search_space_size))

        for i in range(n_samples):
            # NOTE(review): the helper is expected to return two arrays of shape
            # (n_channels, search_space_size) -- confirm against its definition.
            _mean, _std = sliding_mean_std_one_series(self._X[i], Q_length, 1)
            means[i] = _mean
            stds[i] = _std

        self._X_means = means
        self._X_stds = stds

    def fit(self, X, y=None):
        """
        Fit method: store the input data and get the distance profile function.

        Parameters
        ----------
        X : array, shape (n_samples, n_channels, n_timestamps)
            Input array to be used as database for the similarity search.
        y : optional
            Not used.

        Raises
        ------
        TypeError
            If the input X is not a 3D numpy array.

        Returns
        -------
        self
            The fitted estimator.

        """
        # For now force (n_samples, n_channels, n_timestamps), we could convert 2D
        # (n_channels, n_timestamps) to 3D with a warning
        if not isinstance(X, np.ndarray) or X.ndim != 3:
            raise TypeError(
                "Error, only supports 3D numpy of shape"
                "(n_samples, n_channels, n_timestamps)."
            )

        # Get distance function
        self.distance_profile_function = self._get_distance_profile_function()

        self._X = X
        self._fit(X, y)
        return self

    def predict(self, Q):
        """
        Predict method: Check the shape of Q and call _predict to perform the search.

        If the distance profile function is normalized, it stores the mean and stds
        from Q and _X.

        Parameters
        ----------
        Q : array, shape (n_channels, q_length)
            Input query used for similarity search.

        Raises
        ------
        TypeError
            If the input Q is not a 2D numpy array, or if Q is not shorter
            than the series stored in _X.
        ValueError
            If Q and _X do not have the same number of channels.

        Returns
        -------
        array
            An array containing the indexes of the matches between Q and _X.
            The decision of whether a candidate of size q_length from _X is
            matched with Q depends on the subclasses that implement the _predict
            method (e.g. top-k, threshold, ...).

        """
        if not isinstance(Q, np.ndarray) or Q.ndim != 2:
            raise TypeError(
                "Error, only supports 2D numpy atm. If Q is univariate"
                " do Q.reshape(1,-1)."
            )

        # TypeError is kept here for backward compatibility with existing callers.
        if Q.shape[-1] >= self._X.shape[-1]:
            raise TypeError("Error, Q must be shorter than X.")

        # A distance between subsequences is only defined channel by channel.
        if Q.shape[0] != self._X.shape[1]:
            raise ValueError(
                "Error, Q and X must have the same number of channels, got "
                f"{Q.shape[0]} for Q and {self._X.shape[1]} for X."
            )

        if self.normalize:
            self._Q_mean = np.mean(Q, axis=-1)
            self._Q_std = np.std(Q, axis=-1)
            self._store_mean_std_from_inputs(Q.shape[-1])

        return self._predict(Q)

    @abstractmethod
    def _fit(self, X, y):
        """Subclass hook called by fit with the validated X and y."""
        ...

    @abstractmethod
    def _predict(self, X):
        """Subclass hook called by predict; receives the validated query."""
        ...


# Lookup table of the available distance profile functions.
#   1st level key : name of the distance function used
#   2nd level key : boolean indicating whether the distance is normalized
DISTANCE_PROFILE_DICT = dict(
    euclidean={
        False: naive_euclidean_profile,
        True: normalized_naive_euclidean_profile,
    }
)
12 changes: 12 additions & 0 deletions aeon/similarity_search/distance_profiles/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-
"""Distance profiles."""

__author__ = ["baraline"]
__all__ = ["naive_euclidean_profile", "normalized_naive_euclidean_profile"]

from aeon.similarity_search.distance_profiles.naive_euclidean import (
naive_euclidean_profile,
)
from aeon.similarity_search.distance_profiles.normalized_naive_euclidean import (
normalized_naive_euclidean_profile,
)
66 changes: 66 additions & 0 deletions aeon/similarity_search/distance_profiles/_commons.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-
"""Helper and common function for similarity search distance profiles."""


from numba import njit

# NOTE(review): presumably the value below which a standard deviation is treated
# as zero (constant series) during normalization -- no use is visible here; confirm.
AEON_SIMSEARCH_STD_THRESHOLD = 1e-7
# Large finite value standing in for infinity; naive_euclidean.py uses it as the
# initial fill value of the distance profile array.
INF = 1e12


@njit(cache=True)
def _get_input_sizes(X, Q):
    """
    Return the dimensions of the inputs and the size of the search space.

    Parameters
    ----------
    X : array, shape (n_samples, n_channels, series_length)
        The input samples.
    Q : array, shape (n_channels, q_length)
        The input query.

    Returns
    -------
    n_samples : int
        Number of samples in X.
    n_channels : int
        Number of channels in X.
    X_length : int
        Number of timestamps in X.
    q_length : int
        Number of timestamps in Q.
    search_space_size : int
        Number of candidate subsequences of length q_length in each sample
        of X, i.e. X_length - q_length + 1.

    """
    n_cases, n_dims, series_len = X.shape
    query_len = Q.shape[-1]
    n_candidates = series_len - query_len + 1
    return (n_cases, n_dims, series_len, query_len, n_candidates)


@njit(fastmath=True, cache=True)
def _z_normalize_2D_series_with_mean_std(X, mean, std, copy=True):
    """
    Z-normalize each channel of a 2D series with the given means and stds.

    Parameters
    ----------
    X : array, shape = (n_channels, n_timestamps)
        Input array to normalize.
    mean : array, shape = (n_channels)
        Mean of each channel.
    std : array, shape = (n_channels)
        Std of each channel.
    copy : bool, default=True
        If True, the normalization is applied to a copy of X; otherwise X is
        modified in place.

    Returns
    -------
    X : array, shape = (n_channels, n_timestamps)
        The normalized array.
    """
    if copy:
        X = X.copy()
    n_channels = X.shape[0]
    for channel in range(n_channels):
        centered = X[channel] - mean[channel]
        X[channel] = centered / std[channel]
    return X
57 changes: 57 additions & 0 deletions aeon/similarity_search/distance_profiles/naive_euclidean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-
"""Naive Euclidean distance profile."""

__author__ = ["baraline"]

import numpy as np
from numba import njit

from aeon.distances import euclidean_distance
from aeon.similarity_search.distance_profiles._commons import INF, _get_input_sizes


def naive_euclidean_profile(X, Q):
    """
    Compute the Euclidean distance profile of a query by brute force.

    Every candidate subsequence of every series of X is compared to the query Q
    with the Euclidean distance. No optimization is applied, so the search can
    be slow.

    For a (univariate) time series X_i = {x_1, ..., x_m} and a query
    Q = {q_1, ..., q_l}, the distance profile is the vector of size m-(l-1)
    P(X_i, Q) = {d(C_1, Q), ..., d(C_m-(l-1), Q)}, with d the Euclidean distance
    and C_j = {x_j, ..., x_{j+(l-1)}} the j-th candidate subsequence of size l
    in X_i.

    Parameters
    ----------
    X : np.ndarray shape (n_instances, n_channels, series_length)
        The input samples.

    Q : np.ndarray shape (n_channels, query_length)
        The query used for similarity search.

    Returns
    -------
    distance_profile : np.ndarray shape (n_instances, series_length - query_length + 1)
        The distance profile between Q and the input time series X.

    """
    distance_profile = _naive_euclidean_profile(X, Q)
    return distance_profile


@njit(cache=True, fastmath=True)
def _naive_euclidean_profile(X, Q):
    """Compiled brute-force loop behind naive_euclidean_profile.

    Returns an array of shape (n_samples, search_space_size) holding the
    Euclidean distance between Q and each candidate subsequence of X.
    """
    sizes = _get_input_sizes(X, Q)
    n_cases = sizes[0]
    query_len = sizes[3]
    n_candidates = sizes[4]

    profile = np.full((n_cases, n_candidates), INF)

    # Compare the query with every window of every series, one by one.
    for case in range(n_cases):
        for start in range(n_candidates):
            stop = start + query_len
            candidate = X[case, :, start:stop]
            profile[case, start] = euclidean_distance(Q, candidate)

    return profile
Loading