[ENH] Hugging Face and FASTA loaders #155
@@ -0,0 +1,78 @@
__author__ = "satvshr"
__all__ = ["fasta_to_aaseq"]

import os
from io import StringIO

import pandas as pd
from Bio import SeqIO

from pyaptamer.utils._hf_to_dataset import hf_to_dataset


def fasta_to_aaseq(fasta_path, return_df=False):
    """
    Extract sequences from a FASTA file, Hugging Face dataset (FASTA-as-text),
    or URL.

    Parameters
    ----------
    fasta_path : str
        Input location for FASTA content. Three input types are supported,
        tried in this order:

        1. Local filesystem path (absolute or relative to the current
           working directory).
        2. Hugging Face dataset identifier. If the string is not an existing
           local path, the function passes it to ``hf_to_dataset`` and expects
           the returned dataset to contain a column named ``"text"`` where
           each row is a FASTA line. In other words, the dataset must be a
           FASTA file that was uploaded as a text dataset on the Hub. Other
           schemas are not supported.
        3. URL. If ``fasta_path`` is a direct URL pointing to a text file
           (e.g. a FASTA file hosted on a website), it is also handled via
           ``hf_to_dataset``. As with Hugging Face datasets, the content is
           expected to be plain FASTA text.

    return_df : bool, default=False
        If ``False`` (default), return a Python list of sequences (one str
        per FASTA record, in the order they appear). If ``True``, return a
        ``pandas.DataFrame`` indexed by the FASTA record id with a single
        column named ``"sequence"``.

    Returns
    -------
    list[str] or pandas.DataFrame
        - If ``return_df=False``: list of sequence strings.
        - If ``return_df=True``: DataFrame with FASTA record ids as index
          and one column ``"sequence"``.

    Examples
    --------
    - Local FASTA file:
      ``fasta_to_aaseq("my_sequences.fasta")``
    - Hugging Face dataset identifier (dataset must contain FASTA text in a
      column named ``"text"``):
      ``fasta_to_aaseq("username/fasta_dataset", return_df=True)``
    - Direct URL to a FASTA file:
      ``fasta_to_aaseq("https://example.org/my_sequences.fasta")``
    """
    if os.path.exists(fasta_path):
        fasta_handle = fasta_path
    else:
        # Not a local path: resolve via Hugging Face (Hub id or URL) and
        # reassemble the "text" column into an in-memory FASTA handle.
        dataset = hf_to_dataset(fasta_path)
        content = "\n".join(dataset[:]["text"])
        fasta_handle = StringIO(content)

    records = [
        {"id": record.id, "sequence": str(record.seq)}
        for record in SeqIO.parse(fasta_handle, "fasta")
    ]

    if return_df:
        return pd.DataFrame(records).set_index("id")

    return [r["sequence"] for r in records]
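
For concreteness, a minimal usage sketch assembled from the docstring examples above; the file name, dataset identifier, and record id are hypothetical placeholders, not real resources.

# Usage sketch for fasta_to_aaseq (hypothetical inputs).
from pyaptamer.utils import fasta_to_aaseq

# Local file: returns a list of sequence strings, in record order.
seqs = fasta_to_aaseq("my_sequences.fasta")

# Hub dataset uploaded as plain FASTA text: returns a DataFrame indexed
# by record id, with a single "sequence" column.
df = fasta_to_aaseq("username/fasta_dataset", return_df=True)
first_len = df["sequence"].str.len().iloc[0]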
@@ -0,0 +1,61 @@
__author__ = "satvshr"
__all__ = ["hf_to_dataset"]

import os

from datasets import load_dataset

# File formats not natively supported by `datasets.load_dataset`
FILE_FORMATS = ["fasta", "pdb"]


def hf_to_dataset(path, keep_in_memory=True, **kwargs):
    """
    Load any Hugging Face dataset or file into a `datasets.Dataset`.

    This function inspects the file extension of ``path``. Formats that
    `datasets.load_dataset` cannot load natively (listed in
    ``FILE_FORMATS``) are loaded as plain text via the "text" dataset
    loader; everything else is passed to `datasets.load_dataset` directly.

    Parameters
    ----------
    path : str
        Path or identifier for the dataset. Can be:

        - A Hugging Face Hub dataset name (e.g. "imdb",
          "username/dataset_name").
        - A local dataset path.
        - A URL to a dataset file.

    keep_in_memory : bool, default=True
        Whether to keep the dataset in memory instead of writing to the
        disk cache.
    **kwargs : dict
        Additional keyword arguments passed to `datasets.load_dataset`.

    Returns
    -------
    datasets.Dataset or datasets.DatasetDict
        - If the source provides multiple splits (e.g. "train", "test"),
          a `datasets.DatasetDict` is returned.
        - If only a single split is present, the corresponding
          `datasets.Dataset` is returned directly (this function unwraps
          single-split DatasetDicts automatically).
    """
    ext = os.path.splitext(str(path))[-1].lstrip(".").lower()

    if ext not in FILE_FORMATS:
        ds = load_dataset(path, keep_in_memory=keep_in_memory, **kwargs)
    else:
        # Unsupported format: load the raw file as one text row per line.
        ds = load_dataset(
            "text",
            data_files=path,
            keep_in_memory=keep_in_memory,
            **kwargs,
        )

    # Unwrap a single-split DatasetDict into a plain Dataset
    if isinstance(ds, dict) and len(ds) == 1:
        ds = next(iter(ds.values()))

    return ds

Inline review thread on ``def hf_to_dataset``:

Reviewer: This function feels unnecessary; it simply aliases a simple call of ``load_dataset``. Why do we need this? I would simply remove it. Maybe you could explain? Side remark: the try/except structure looks problematic. Generally, we should avoid using try/except as an if/else; instead we should use if/else with the correct condition.

Reviewer: Is there not a condition that we can replace this with, in an if/else?

Author: If there were a defined variable which Hugging Face used to list all their recognisable file formats I would say yes, but as of now I don't think it is possible.

Reviewer: I still doubt the reasoning behind having this function. Can you please give two examples where we currently need this, one example for each branch of the code?

Reviewer: I still do not get what scenario we would need the try/except in. Does the user not always know what format the file is in that they are attempting to load? And in cases where we hardcode the dataset, do we not know?

Author: The function has to be used differently depending on whether or not the file is in a recognisable file format. This is something I discovered after spending a bit of time looking into it; the function is just making life easier and saving users from looking into it themselves.

Reviewer: Can you please give an example of a scenario where this would be useful?
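
The review thread asks for one example per branch of ``hf_to_dataset``; a minimal sketch along those lines, where "imdb" is a public Hub dataset (also named in the docstring) and the FASTA URL is a hypothetical placeholder:

# One call per branch of hf_to_dataset.
from pyaptamer.utils import hf_to_dataset

# Branch 1: no extension in FILE_FORMATS, so the identifier is passed
# straight to `datasets.load_dataset`; "imdb" resolves on the Hub.
ds_native = hf_to_dataset("imdb", split="train")

# Branch 2: ".fasta" is in FILE_FORMATS, so the file is read via the
# generic "text" loader, one line per row in a "text" column.
ds_fasta = hf_to_dataset("https://example.org/my_sequences.fasta")
assert "text" in ds_fasta.column_names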
@@ -0,0 +1,18 @@
import pandas as pd

from pyaptamer.utils import fasta_to_aaseq


def test_huggingface_url_fetch():
    """Integration test: fetch FASTA from a Hugging Face dataset URL."""
    url = "https://huggingface.co/datasets/gcos/HoloRBP4_round8_trimmed/resolve/main/HoloRBP4_round8_trimmed.fasta"

    sequences = fasta_to_aaseq(url)
    assert isinstance(sequences, list)
    assert len(sequences) > 0
    assert all(isinstance(seq, str) and seq for seq in sequences)

    df = fasta_to_aaseq(url, return_df=True)
    assert isinstance(df, pd.DataFrame)
    assert not df.empty
    assert all(df["sequence"].str.len() > 0)
@@ -0,0 +1,20 @@
import os

from pyaptamer.utils import hf_to_dataset


def test_hf_hub_dataset_load():
    """Test loading a small FASTA file from a Hugging Face Hub URL."""
    ds = hf_to_dataset(
        "https://huggingface.co/datasets/gcos/HoloRBP4_round8_trimmed/resolve/main/HoloRBP4_round8_trimmed.fasta"
    )
    assert "text" in ds.column_names


def test_load_pdb_local_file():
    """Test parsing a local PDB file (pfoa.pdb) from the data folder."""
    pdb_file = os.path.join(
        os.path.dirname(__file__), "..", "..", "datasets", "data", "pfoa.pdb"
    )
    ds = hf_to_dataset(pdb_file)
    assert "text" in ds.column_names