Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pyaptamer/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@
"rna2vec",
"pdb_to_struct",
"struct_to_aaseq",
"fasta_to_aaseq",
"hf_to_dataset",
]

from pyaptamer.utils._fasta_to_aaseq import fasta_to_aaseq
from pyaptamer.utils._hf_to_dataset import hf_to_dataset
from pyaptamer.utils._pdb_to_struct import pdb_to_struct
from pyaptamer.utils._rna import (
dna2rna,
Expand Down
78 changes: 78 additions & 0 deletions pyaptamer/utils/_fasta_to_aaseq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
__author__ = "satvshr"
__all__ = ["fasta_to_aaseq"]

import os
from io import StringIO

import pandas as pd
from Bio import SeqIO

from pyaptamer.utils._hf_to_dataset import hf_to_dataset


def fasta_to_aaseq(fasta_path, return_df=False):
    """
    Extract amino-acid sequences from FASTA content.

    Three kinds of input are accepted, probed in this order:

    1. A local filesystem path (absolute, or relative to the current
       working directory).
    2. A Hugging Face dataset identifier. Anything that is not an existing
       local path is handed to ``hf_to_dataset``; the returned dataset must
       expose a ``"text"`` column holding one FASTA line per row (i.e. a
       FASTA file uploaded to the Hub as a text dataset). Other schemas are
       not supported.
    3. A direct URL to a plain-text FASTA file, which is likewise routed
       through ``hf_to_dataset``.

    Parameters
    ----------
    fasta_path : str
        Location of the FASTA content (path, Hub identifier, or URL).
    return_df : bool, default=False
        When ``False``, return a plain list of sequence strings in file
        order. When ``True``, return a ``pandas.DataFrame`` indexed by the
        FASTA record id with a single ``"sequence"`` column.

    Returns
    -------
    list[str] or pandas.DataFrame
        Sequences as a list, or as a DataFrame keyed by record id.

    Examples
    --------
    - Local FASTA file:
      ``fasta_to_aaseq("my_sequences.fasta")``
    - Hugging Face dataset identifier (dataset must contain FASTA text in a
      column named "text"):
      ``fasta_to_aaseq("username/fasta_dataset", return_df=True)``
    - Direct URL to a FASTA file:
      ``fasta_to_aaseq("https://example.org/my_sequences.fasta")``
    """
    # Prefer a local file; otherwise pull the content down via hf_to_dataset
    # and rebuild the FASTA text from its "text" column.
    if os.path.exists(fasta_path):
        source = fasta_path
    else:
        remote = hf_to_dataset(fasta_path)
        source = StringIO("\n".join(remote[:]["text"]))

    entries = []
    for rec in SeqIO.parse(source, "fasta"):
        entries.append({"id": rec.id, "sequence": str(rec.seq)})

    if return_df:
        return pd.DataFrame(entries).set_index("id")

    return [entry["sequence"] for entry in entries]
61 changes: 61 additions & 0 deletions pyaptamer/utils/_hf_to_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
__author__ = "satvshr"
__all__ = ["hf_to_dataset"]

import os

from datasets import load_dataset

# File formats not natively supported by `datasets.load_dataset`; files with
# these extensions are loaded through the generic "text" loader instead
# (one dataset row per line of the file).
FILE_FORMATS = ["fasta", "pdb"]


def hf_to_dataset(path, keep_in_memory=True, **kwargs):
Copy link
Contributor

@fkiraly fkiraly Sep 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this function feels unnecessary, it simply aliases a simple call of huggingface load_dataset.

Why do we need this? I would simply remove it.

Maybe you could explain?

Side remark: The try/expect structure looks problematic. Generally, we should avoid using try/except as an if/else, instead we should use if/else with the correct condition.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there not a condition that we can replace this in, in an if/else?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If there was a defined variable which hugging face used to list all their recognisable file formats I would say yes, but as of now I dont think it is possible.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still doubt the reasoning behind having this function.

Can you please give two examples where we currently need this, one example for each branch of the code?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

try block: We want to load a file from Hugging Face which is in one of the known file formats supported by Hugging Face, so something like the imdb dataset, which is a csv file (known file format).

except block: We want to load a file from Hugging Face which is not one of the known file formats (say a fasta/pdb/mmcif file), like the fasta file which Jakob provided. If it was used in the above `try` block, it would throw an error. This block will also work if any local files are supplied to the loader and would return a Dataset object.

Copy link
Contributor

@fkiraly fkiraly Oct 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still do not get what scenario we would need the try/except in.

Does the user not always know what format the file is in that they are attempting to load? And in cases where we hardcode the dataset, do we not know?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the user not always know what format the file is in that they are attempting to load?

The function has to be used differently depending on whether or not the file is a recognisable file format. This information is something I discovered after I spent a bit of time looking into it, the function is just making life easier and saving users from looking into it themselves.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you please give an example of a scenario where this would be useful?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

try block: We want to load a file from Hugging Face which is in one of the known file formats supported by Hugging Face, so something like the imdb dataset, which is a csv file (known file format).

except block: We want to load a file from Hugging Face which is not one of the known file formats (say a fasta/pdb/mmcif file), like the fasta file which Jakob provided. If it was used in the above `try` block, it would throw an error. This block will also work if any local files are supplied to the loader and would return a Dataset object.

The try case would need a call like load_dataset("imdb") whereas in the except case it would need a call like load_dataset("text", data_files="https://huggingface.co/datasets/gcos/HoloRBP4_round8_trimmed")

"""
Load any Hugging Face dataset or file into a `datasets.Dataset`.

This function first attempts to load the dataset natively using
`datasets.load_dataset`. If the dataset format is unsupported, it falls back
to loading the file as plain text via the "text" dataset loader.

Parameters
----------
path : str
Path or identifier for the dataset. Can be:

- A Hugging Face Hub dataset name (e.g. "imdb", "username/dataset_name").
- A local dataset path.
- A URL to a dataset file.

keep_in_memory : bool, default=True
Whether to keep the dataset in memory instead of writing to disk cache.
**kwargs : dict
Additional keyword arguments passed to `datasets.load_dataset`.

Returns
-------
datasets.Dataset or datasets.DatasetDict

- If the source provides multiple splits (e.g. "train", "test"),
a `datasets.DatasetDict` is returned.
- If only a single split is present, the corresponding
`datasets.Dataset` is returned directly (this function unwraps
single-split DatasetDicts automatically).

"""
ext = os.path.splitext(str(path))[-1].lstrip(".").lower()

if ext not in FILE_FORMATS:
ds = load_dataset(path, keep_in_memory=keep_in_memory, **kwargs)
else:
ds = load_dataset(
"text",
data_files=path,
keep_in_memory=keep_in_memory,
**kwargs,
)

# Unwrap single-split DatasetDict into Dataset
if isinstance(ds, dict) and len(ds) == 1:
ds = next(iter(ds.values()))

return ds
18 changes: 18 additions & 0 deletions pyaptamer/utils/tests/test_fasta_to_aaseq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import pandas as pd

from pyaptamer.utils import fasta_to_aaseq


def test_huggingface_url_fetch():
    """Integration test: fetch FASTA from a Hugging Face dataset URL."""
    url = "https://huggingface.co/datasets/gcos/HoloRBP4_round8_trimmed/resolve/main/HoloRBP4_round8_trimmed.fasta"

    # List output: non-empty list of non-empty strings.
    seqs = fasta_to_aaseq(url)
    assert isinstance(seqs, list)
    assert len(seqs) > 0
    for seq in seqs:
        assert isinstance(seq, str)
        assert seq

    # DataFrame output: non-empty frame whose sequences are all non-empty.
    frame = fasta_to_aaseq(url, return_df=True)
    assert isinstance(frame, pd.DataFrame)
    assert not frame.empty
    assert (frame["sequence"].str.len() > 0).all()
20 changes: 20 additions & 0 deletions pyaptamer/utils/tests/test_hf_to_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import os

from pyaptamer.utils import hf_to_dataset


def test_hf_hub_dataset_load():
    """Test loading a known Hugging Face Hub dataset (small)."""
    fasta_url = "https://huggingface.co/datasets/gcos/HoloRBP4_round8_trimmed/resolve/main/HoloRBP4_round8_trimmed.fasta"
    dataset = hf_to_dataset(fasta_url)
    # FASTA content is loaded via the "text" loader, so a "text" column exists.
    assert "text" in dataset.column_names


def test_load_pdb_local_file():
    """Test parsing a local PDB file (pfoa.pdb) from the data folder."""
    here = os.path.dirname(__file__)
    pdb_path = os.path.join(here, "..", "..", "datasets", "data", "pfoa.pdb")
    dataset = hf_to_dataset(pdb_path)
    # PDB is an unsupported format, so it is loaded as plain text.
    assert "text" in dataset.column_names
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ dependencies = [
"scikit-learn>=1.3.0",
"skorch",
"imblearn",
"datasets",
]

[project.optional-dependencies]
Expand Down