diff --git a/pyaptamer/pseaac/__init__.py b/pyaptamer/pseaac/__init__.py index 13b1c9be..fcac43ff 100644 --- a/pyaptamer/pseaac/__init__.py +++ b/pyaptamer/pseaac/__init__.py @@ -1,5 +1,6 @@ """The PSeAAC encoding algorithm""" -from pyaptamer.pseaac._features import PSeAAC +from pyaptamer.pseaac._pseaac_aptanet import AptaNetPSeAAC +from pyaptamer.pseaac._pseaac_general import PSeAAC -__all__ = ["PSeAAC"] +__all__ = ["PSeAAC", "AptaNetPSeAAC"] diff --git a/pyaptamer/pseaac/_props.py b/pyaptamer/pseaac/_props.py index 2509cf83..a05c4b90 100644 --- a/pyaptamer/pseaac/_props.py +++ b/pyaptamer/pseaac/_props.py @@ -5,7 +5,7 @@ import pandas as pd -def aa_props(type="numpy", normalize=True): +def aa_props(prop_indices=None, type="numpy", normalize=True): """ Amino acid physicochemical property matrix for PSeAAC. @@ -13,6 +13,7 @@ def aa_props(type="numpy", normalize=True): 20 standard amino acids. Each row corresponds to an amino acid (in the order: A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y), and each column corresponds to a property (P1–P21). The properties are in the order: + - Hydrophobicity - Hydrophilicity - Side-chain Mass @@ -37,6 +38,7 @@ def aa_props(type="numpy", normalize=True): References ---------- + - https://github.com/nedaemami/AptaNet/blob/main/feature_extraction.py - Hydrophobicity values are from JACS, 1962, 84: 4240-4246. (C. Tanford) - Hydrophilicity values are from PNAS, 1981, 78:3824-3828 (T.P.Hopp & K.R.Woods) @@ -66,6 +68,9 @@ def aa_props(type="numpy", normalize=True): Parameters ---------- + prop_indices : list of int, optional + List of indices (0-based) of properties to include (e.g., [0, 4, 7]). + If None, returns all 21 properties. type : {'numpy', 'pandas'}, default='numpy' If 'pandas', returns a DataFrame with amino acid and property labels. If 'numpy', returns a numpy array. @@ -79,10 +84,9 @@ def aa_props(type="numpy", normalize=True): Returns ------- props : numpy.ndarray or pandas.DataFrame (depending on `type`) + - Rows: standard amino acids (A, C, D, ..., Y) - - Columns: physicochemical properties (P1–P21) of the standard amino acids, as - mentioned in the original implementation: - https://github.com/nedaemami/AptaNet/blob/main/feature_extraction.py + - Columns: physicochemical properties of the standard amino acids. - Entries: raw or normalized property values depending on `normalize`. Examples @@ -1072,8 +1076,14 @@ def aa_props(type="numpy", normalize=True): ] ).T # shape (20, 21) + if prop_indices is not None: + props = props[:, prop_indices] + selected_names = [prop_names[i] for i in prop_indices] + else: + selected_names = prop_names + if type == "pandas": - return pd.DataFrame(props, index=aa_order, columns=prop_names) + return pd.DataFrame(props, index=aa_order, columns=selected_names) elif type == "numpy": return props else: diff --git a/pyaptamer/pseaac/_features.py b/pyaptamer/pseaac/_pseaac_aptanet.py similarity index 99% rename from pyaptamer/pseaac/_features.py rename to pyaptamer/pseaac/_pseaac_aptanet.py index 8a3f3f87..d98b1366 100644 --- a/pyaptamer/pseaac/_features.py +++ b/pyaptamer/pseaac/_pseaac_aptanet.py @@ -1,5 +1,5 @@ __author__ = ["nennomp", "satvshr"] -__all__ = ["PSeAAC"] +__all__ = ["AptaNetPSeAAC"] from collections import Counter @@ -9,7 +9,7 @@ from pyaptamer.utils._pseaac_utils import AMINO_ACIDS, clean_protein_seq -class PSeAAC: +class AptaNetPSeAAC: """ Compute Pseudo Amino Acid Composition (PseAAC) features for a protein sequence. @@ -25,7 +25,6 @@ class PSeAAC: Group 1 includes properties 1–3, Group 2 includes properties 4–6, and so on, up to Group 7, which includes properties 19–21. The properties in order are: - 1. Hydrophobicity 2. Hydrophilicity 3. Side-chain Mass @@ -48,10 +47,8 @@ class PSeAAC: 20. Unfolding Enthalpy Change 21. Unfolding Gibbs Free Energy Change - Each feature vector consists of: - - 20 normalized amino acid composition features (frequency of each standard amino acid) - `self.lambda_val` sequence-order correlation features based on physicochemical diff --git a/pyaptamer/pseaac/_pseaac_general.py b/pyaptamer/pseaac/_pseaac_general.py new file mode 100644 index 00000000..2029ad8a --- /dev/null +++ b/pyaptamer/pseaac/_pseaac_general.py @@ -0,0 +1,279 @@ +__author__ = ["nennomp", "satvshr"] +__all__ = ["PSeAAC"] + +from collections import Counter + +import numpy as np + +from pyaptamer.pseaac._props import aa_props +from pyaptamer.utils._pseaac_utils import AMINO_ACIDS, clean_protein_seq + + +class PSeAAC: + """ + Compute Pseudo Amino Acid Composition (PseAAC) features for a protein sequence. + + This class generates a numerical feature vector that encodes both the composition + and local order of amino acids in a protein sequence. The features are derived from + selected physicochemical properties and sequence-order correlations as described in + the PseAAC model by Chou. + + The PSeAAC algorithm uses normalized physicochemical (NP) properties of amino + acids, loaded from a predefined matrix using `aa_props`. Properties can be grouped + in one of three ways: + + - `prop_indices`: A list of property indices (0-based) to select from the 21 + available properties. If None, all 21 properties are used. + - `group_props`: If provided as an integer, the selected properties are grouped + into chunks of this size (e.g., `group_props=3` groups into sets of 3). + If None, the default is groups of size 3 (7 groups for 21 properties). + - `custom_groups`: A list of lists, where each sublist contains local column + indices into the selected property matrix. This overrides all other grouping + logic. + + The 21 physicochemical properties (columns) are: + + 0. Hydrophobicity + 1. Hydrophilicity + 2. Side-chain Mass + 3. Polarity + 4. Molecular Weight + 5. Melting Point + 6. Transfer Free Energy + 7. Buriability + 8. Bulkiness + 9. Solvation Free Energy + 10. Relative Mutability + 11. Residue Volume + 12. Volume + 13. Amino Acid Distribution + 14. Hydration Number + 15. Isoelectric Point + 16. Compressibility + 17. Chromatographic Index + 18. Unfolding Entropy Change + 19. Unfolding Enthalpy Change + 20. Unfolding Gibbs Free Energy Change + + Each feature vector consists of: + + - 20 normalized amino acid composition features (frequency of each standard + amino acid) + - `lambda_val` sequence-order correlation features (theta values) computed + from the selected physicochemical property groups. + + For each property group, the above (20 + `lambda_val`) features are computed, + resulting in a final vector of length (20 + lambda_val) * number of normalized + physiochemical (NP) property groups of amino acids (default 7). + + Parameters + ---------- + lambda_val : int, optional, default=30 + The lambda parameter defining the number of sequence-order correlation factors. + This also determines the minimum length allowed for input protein sequences, + which should be of length greater than `lambda_val`. + weight : float, optional, default=0.05 + The weight factor for the sequence-order correlation features. + prop_indices : list[int] or None, optional + Indices of properties to use (0-based). If None, all 21 properties are used. + group_props : int or None, optional + Group size for selected properties. If None, defaults to groups of 3. + custom_groups : list[list[int]] or None, optional + Explicit groupings of local property indices. Overrides `group_props`. + + Attributes + ---------- + np_matrix : np.ndarray of shape (20, n_props) + Normalized property values for the selected amino acids and properties. + prop_groups : list[list[int]] + Groupings of local property indices into `np_matrix`. + + Methods + ------- + transform(protein_sequence) + Generate the PseAAC feature vector for the given protein sequence. + + References + ---------- + Shen HB, Chou KC. PseAAC: a flexible web server for generating various kinds of + protein pseudo amino acid composition. Anal Biochem. 2008 Feb 15;373(2):386-8. + doi: 10.1016/j.ab.2007.10.012. Epub 2007 Oct 13. PMID: 17976365. + + Example + ------- + >>> from pyaptamer.pseaac import PSeAAC + >>> seq = "ACDFFKKIIKKLLMMNNPPQQQRRRRIIIIRRR" + >>> # Select only 6 properties and group into 3 groups of equal size + >>> pseaac = PSeAAC(prop_indices=[0, 1, 2, 3, 4, 5], group_props=2) + >>> # Custom grouping (4 groups) + >>> pseaac = PSeAAC(custom_groups=[[0, 1], [2, 3], [4, 5], [6, 7]]) + >>> # Default: all properties, grouped into 7 groups of 3 + >>> pseaac = PSeAAC() + >>> features = pseaac.transform("ACDEFGHIKLMNPQRHIKLMNPQRSTVWHIKLMNPQRSTVWY") + >>> print(features[:10]) + [0.006 0.006 0.006 0.006 0.006 0.006 0.018 0.018 0.018 0.018] + """ + + def __init__( + self, + lambda_val=30, + weight=0.05, + prop_indices=None, + group_props=None, + custom_groups=None, + ): + self.lambda_val = lambda_val + self.weight = weight + + if group_props is not None and custom_groups is not None: + raise ValueError( + "Specify only one of `group_props` or `custom_groups`,not both." + ) + + self.np_matrix = aa_props( + prop_indices=prop_indices, type="numpy", normalize=True + ) + self._n_cols = self.np_matrix.shape[1] # The number of properties selected + + if custom_groups: + self.prop_groups = custom_groups + elif group_props is None: + if self._n_cols % 3 != 0: + raise ValueError( + "Default grouping expects number of properties divisible by 3." + ) + self.prop_groups = [ + list(range(i, i + 3)) for i in range(0, self._n_cols, 3) + ] + else: + if self._n_cols % group_props != 0: + raise ValueError( + f"Number of properties ({self._n_cols}) must be divisible by" + f"group_props ({group_props})." + ) + self.prop_groups = [ + list(range(i, i + group_props)) + for i in range(0, self._n_cols, group_props) + ] + + def _normalized_aa(self, seq): + """ + Compute the normalized amino acid composition for a sequence. + + Parameters + ---------- + seq : str + Protein sequence. + + Returns + ------- + np.ndarray + A 1D NumPy array of length 20, where each entry corresponds to the frequency + of a standard amino acid in the input sequence. The order of amino acids is: + ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', + 'S', 'T', 'V', 'W', 'Y'] + """ + counts = Counter(seq) + total = len(seq) + return np.array([counts.get(aa, 0) / total for aa in AMINO_ACIDS]) + + def _avg_theta_val(self, seq_vec, seq_len, n, prop_group): + """ + Compute the average theta value for a sequence and property group. + + Parameters + ---------- + seq_vec : np.ndarray + Sequence converted to integer indices (shape: [seq_len]). + seq_len : int + Length of the sequence. + n : int + Offset for theta calculation. + prop_group : tuple of int + Tuple of property indices. + + Returns + ------- + float + Average theta value. + """ + props = self.np_matrix[:, prop_group] + + ri = props[seq_vec[: seq_len - n]] + rj = props[seq_vec[n:]] + + diffs = rj - ri + return np.mean(diffs**2) + + def transform(self, protein_sequence): + """ + Generate the PseAAC feature vector for the given protein sequence. + + This method computes a set of features based on amino acid composition + and sequence-order correlations using physicochemical properties, as + described in the Pseudo Amino Acid Composition (PseAAC) model. The protein + sequence should be of length greater than `self.lambda_val`. + + Parameters + ---------- + protein_sequence : str + The input protein sequence consisting of valid amino acid characters + (A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y). + lambda_val : int, default=30 + The maximum distance between residues considered in the sequence-order + correlation (θ) calculations. + weight : float, default=0.15 + The weight factor that balances the contribution of sequence-order + correlation features relative to amino acid composition features. + + Returns + ------- + np.ndarray + A 1D NumPy array of length (20 + `self.lambda_val) * number of normalized + physiochemical (NP) property groups of amino acids (default 7). + Each element consists of: + - 20 normalized amino acid composition features + - `self.lambda_val` normalized sequence-order correlation factors (theta + values) + + Raises + ------ + ValueError + If the input sequence contains invalid amino acids or is shorter than + `self.lambda_val`. + """ + seq = clean_protein_seq(protein_sequence) + seq_len = len(seq) + if seq_len <= self.lambda_val: + raise ValueError( + f"Protein sequence is too short, should be longer than `lambda_val`. " + f"Sequence length: {seq_len}, `lambda_val`: {self.lambda_val}." + ) + + aa_to_idx = {aa: i for i, aa in enumerate(AMINO_ACIDS)} + seq_vec = np.array([aa_to_idx[aa] for aa in seq], dtype=np.int32) + + aa_freq = self._normalized_aa(seq) + sum_all_aa_freq = aa_freq.sum() + + all_pseaac = [] + for prop_group in self.prop_groups: + all_theta_val = np.array( + [ + self._avg_theta_val(seq_vec, seq_len, n, prop_group) + for n in range(1, self.lambda_val + 1) + ] + ) + + sum_all_theta_val = np.sum(all_theta_val) + denominator_val = sum_all_aa_freq + (self.weight * sum_all_theta_val) + + # First 20 features: normalized amino acid composition + all_pseaac.extend(np.round(aa_freq / denominator_val, 3)) + + # Next `self.lambda_val` features: theta values + all_pseaac.extend( + np.round((self.weight * all_theta_val) / denominator_val, 3) + ) + + return np.array(all_pseaac) diff --git a/pyaptamer/pseaac/tests/test_pseaac.py b/pyaptamer/pseaac/tests/test_pseaac.py index ddfba30a..841e9948 100644 --- a/pyaptamer/pseaac/tests/test_pseaac.py +++ b/pyaptamer/pseaac/tests/test_pseaac.py @@ -3,30 +3,26 @@ import numpy as np import pytest -from pyaptamer.pseaac import PSeAAC +from pyaptamer.pseaac import AptaNetPSeAAC, PSeAAC from pyaptamer.pseaac._props import aa_props from pyaptamer.pseaac.tests._props import solution +vector = "ACDFFKKIIKKLLMMNNPPQQQRRRRIIIIRRR" + def test_normalized_values(): """ Test that normalized property matrix matches expected normalized values. - Asserts - ------- - All normalized property values match the hard-coded normalized matrix - for each amino acid and property. + This test targets the common `aa_props` normalization and therefore uses + the package-level `aa_props` implementation. """ - # Get original and normalized property matrices as DataFrames original_df = aa_props(type="pandas", normalize=False) normalized_df = aa_props(type="pandas", normalize=True) - # Manually normalize the original matrix (z-score, column-wise, - # rounded to 3 decimals) manual_norm = (original_df - original_df.mean()) / original_df.std(ddof=0) manual_norm = manual_norm.round(3) - # Compare each value for aa in original_df.index: for prop in original_df.columns: assert normalized_df.loc[aa, prop] == manual_norm.loc[aa, prop], ( @@ -35,6 +31,7 @@ def test_normalized_values(): ) +@pytest.mark.parametrize("PCLASS", [PSeAAC, AptaNetPSeAAC]) @pytest.mark.parametrize( "seq,lambda_val", [ @@ -43,12 +40,12 @@ def test_normalized_values(): ("A", 2), # length 1, lambda_val 2 ], ) -def test_pseaac_transform_sequence_too_short(seq, lambda_val): +def test_pseaac_transform_sequence_too_short(PCLASS, seq, lambda_val): """ - Test that the PSeAAC transform method raises an error for protein sequences of - length smaller or equal to lambda_val. + Both PSeAAC implementations must raise for sequences shorter than or equal + to `lambda_val`. """ - p = PSeAAC(lambda_val=lambda_val) + p = PCLASS(lambda_val=lambda_val) with pytest.raises(ValueError, match="Protein sequence is too short"): p.transform(seq) @@ -57,28 +54,18 @@ def test_pseaac_transform_sequence_too_short(seq, lambda_val): "seq,expected_vector", [ ( - "ACDFFKKIIKKLLMMNNPPQQQRRRRIIIIRRR", + vector, solution, ) ], ) def test_pseaac_vectorization(seq, expected_vector): """ - Test that the PSeAAC vectorization produces the expected feature vector. - - Parameters - ---------- - seq : str - Protein sequence to transform. - expected_vector : list of float - Expected PSeAAC feature vector. - - Asserts - ------- - The produced vector matches the expected vector in length and - values (within tolerance). + Test that the AptaNet-specific PSeAAC vectorization produces the expected + feature vector. This compares against the provided `solution` which matches + the AptaNet implementation. """ - p = PSeAAC() + p = AptaNetPSeAAC() pv = p.transform(seq) assert len(pv) == len(expected_vector), ( @@ -90,3 +77,51 @@ def test_pseaac_vectorization(seq, expected_vector): if not np.isclose(a, b, atol=1e-3) ] assert not mismatches, f"Vector values mismatch at indices: {mismatches}" + + +@pytest.mark.parametrize( + "seq,prop_indices,group_props,custom_groups,expected_len", + [ + # Test case 1: default props, group of 3 (should result in 7 groups * 50 = 350) + (vector, None, None, None, 350), + # Test case 2: select only 6 props, group into 2 (3 groups * 50 = 150) + (vector, [0, 1, 2, 3, 4, 5], 2, None, 150), + # Test case 3: custom grouping of 4 groups(4 groups * 50 = 200) + (vector, None, None, [[0, 1], [2, 3], [4, 5], [6, 7]], 200), + ], +) +def test_pseaac_configurations( + seq, + prop_indices, + group_props, + custom_groups, + expected_len, +): + """ + Test different PSeAAC configurations with various property groupings. + + Parameters + ---------- + seq : str + Protein sequence to transform. + prop_indices : list of int or None + Property indices to use (0-based). + group_props : int or None + Grouping size for automatic chunking. + custom_groups : list of list of int or None + Custom property groups. + expected_len : int + Expected length of resulting feature vector. + + Asserts + ------- + Output feature vector has the expected length. + """ + pse = PSeAAC( + prop_indices=prop_indices, group_props=group_props, custom_groups=custom_groups + ) + vec = pse.transform(seq) + + assert len(vec) == expected_len, ( + f"Expected vector length {expected_len}, but got {len(vec)}" + ) diff --git a/pyaptamer/utils/_aptanet_utils.py b/pyaptamer/utils/_aptanet_utils.py index dd2de07c..7c238520 100644 --- a/pyaptamer/utils/_aptanet_utils.py +++ b/pyaptamer/utils/_aptanet_utils.py @@ -5,7 +5,7 @@ import numpy as np -from pyaptamer.pseaac import PSeAAC +from pyaptamer.pseaac import AptaNetPSeAAC def generate_kmer_vecs(aptamer_sequence, k=4): @@ -84,7 +84,7 @@ def pairs_to_features(X, k=4): A 2D NumPy array where each row corresponds to the concatenated feature vector for a given (aptamer, protein) pair. """ - pseaac = PSeAAC() + pseaac = AptaNetPSeAAC() feats = [] for aptamer_seq, protein_seq in X: