Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyaptamer/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure
from pyaptamer.datasets._loaders._online_databank import load_from_rcsb
from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure
from pyaptamer.datasets._loaders._pfoa import load_pfoa_structure

__all__ = [
"load_pfoa_structure",
Expand Down
2 changes: 1 addition & 1 deletion pyaptamer/datasets/_loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Loaders for different data structures."""

from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure
from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure
from pyaptamer.datasets._loaders._pfoa import load_pfoa_structure

__all__ = ["load_pfoa_structure", "load_1gnh_structure"]
43 changes: 43 additions & 0 deletions pyaptamer/utils/pdb_to_aaseq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""Utility for extracting amino-acid sequences from PDB files."""

__author__ = "satvshr"
__all__ = ["pdb_to_aaseq"]

import os

import pandas as pd
from Bio import SeqIO


def pdb_to_aaseq(pdb_file_path, return_df=False):
    """
    Extract amino-acid sequences (SEQRES) from a PDB file.

    Parameters
    ----------
    pdb_file_path : str or os.PathLike
        Path to a PDB file.
    return_df : bool, optional, default=False
        If True, return a ``pandas.DataFrame`` with columns:

        - ``sequence`` (one-letter amino-acid string per chain) and
        - ``chain`` (chain identifier taken from the record's ``chain``
          annotation, ``None`` if the parser did not provide one).

        If False, return a list of strings (one per chain).

    Returns
    -------
    list of str or pandas.DataFrame
        List of amino-acid sequences (one-letter codes), or a DataFrame
        containing the sequences, extracted from the SEQRES records in the PDB
        file. Each sequence is a string representing the full sequence for a
        chain (e.g. ``"MKWVTFISLL..."``). The order of sequences matches the
        order in which SEQRES records are encountered in the file. Returns an
        empty list (or an empty DataFrame when ``return_df`` is True) if no
        SEQRES records are found.
    """
    # NOTE(review): a reviewer suggested replacing ``return_df`` with a
    # ``return_type`` argument accepting "pd.df" or "list"; the boolean flag is
    # kept here so the current call signature stays valid.
    pdb_path = os.fspath(pdb_file_path)

    # Materialise the records while the file is still open: SeqIO.parse is
    # lazy and reads from the handle as it is iterated.
    with open(pdb_path) as handle:
        records = list(SeqIO.parse(handle, "pdb-seqres"))

    sequences = [str(record.seq) for record in records]

    if not return_df:
        return sequences

    # "sequence" stays the first column; "chain" is added so the DataFrame
    # matches what the docstring has always promised.
    chains = [record.annotations.get("chain") for record in records]
    return pd.DataFrame({"sequence": sequences, "chain": chains})
32 changes: 32 additions & 0 deletions pyaptamer/utils/tests/test_pdb_to_aaseq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Tests for ``pyaptamer.utils.pdb_to_aaseq``."""

__author__ = "satvshr"

import os

from pyaptamer.utils.pdb_to_aaseq import pdb_to_aaseq


def test_pdb_to_aaseq():
    """
    Check both return modes of ``pdb_to_aaseq`` on the bundled ``1gnh.pdb`` file.

    The default call must yield a non-empty list of non-empty strings, and the
    ``return_df=True`` call must yield a non-empty DataFrame whose ``sequence``
    column holds non-empty strings.
    """

    def _check_entries(entries):
        # Shared per-sequence checks for the list and DataFrame variants.
        for entry in entries:
            assert isinstance(entry, str), "Each entry should be a string"
            assert len(entry) > 0, "Each sequence string should not be empty"

    tests_dir = os.path.dirname(__file__)
    pdb_file_path = os.path.join(
        tests_dir, "..", "..", "datasets", "data", "1gnh.pdb"
    )

    # Default mode: plain list of sequence strings.
    as_list = pdb_to_aaseq(pdb_file_path)
    assert isinstance(as_list, list), "pdb_to_aaseq should return a list"
    assert len(as_list) > 0, "Returned list should not be empty"
    _check_entries(as_list)

    # DataFrame mode: same sequences exposed through a "sequence" column.
    as_frame = pdb_to_aaseq(pdb_file_path, return_df=True)
    assert not as_frame.empty, "Returned DataFrame should not be empty"
    assert "sequence" in as_frame.columns, "DataFrame should have a 'sequence' column"
    _check_entries(as_frame["sequence"])
Loading