'''
This module provides functionality for manipulating sequences.
'''
# Built-ins
import logging
from . import modification, protein
import pyproteome as pyp
LOGGER = logging.getLogger('pyproteome.sequence')
class ProteinMatch:
    '''
    Contains information about how a peptide sequence maps onto a protein.

    Instances are hashable and compare equal when their protein, relative
    position and exact flag are all equal. Ordering (``<``) only considers
    the protein.

    Attributes
    ----------
    protein : :class:`.protein.Protein`
        Protein object.
    rel_pos : int
        Relative position of the peptide start within the protein sequence.
    exact : bool
        Indicates whether a peptide sequence exactly matches its protein
        sequence.
    '''

    def __init__(self, protein, rel_pos, exact):
        '''
        Parameters
        ----------
        protein : :class:`.protein.Protein`
        rel_pos : int
        exact : bool
        '''
        self.protein = protein
        self.rel_pos = rel_pos
        self.exact = exact

    def to_tuple(self):
        '''
        Returns the fields used for hashing and equality.

        Returns
        -------
        tuple
            ``(protein, rel_pos, exact)``
        '''
        return (
            self.protein,
            self.rel_pos,
            self.exact,
        )

    def __hash__(self):
        return hash(self.to_tuple())

    def __lt__(self, other):
        # Let Python try the reflected operation / raise its own TypeError
        # instead of failing with an AttributeError on a foreign object.
        if not isinstance(other, ProteinMatch):
            return NotImplemented

        return self.protein < other.protein

    def __eq__(self, other):
        # Returning NotImplemented (rather than raising) keeps ``==``, ``!=``
        # and ``in`` usable against None or objects of other types.
        if not isinstance(other, ProteinMatch):
            return NotImplemented

        return self.to_tuple() == other.to_tuple()
class Sequence:
    '''
    Contains information about a sequence and which proteins it matches to.

    Attributes
    ----------
    pep_seq : str
        Peptide sequence, in 1-letter amino code.
    protein_matches : tuple of :class:`.ProteinMatch`
        Sorted collection of all proteins that a peptide sequence matches.
    modifications : :class:`.modification.Modifications` or None
        Object listing all post-translation modifications identified on a
        peptide. ``None`` is treated as "no modifications".
    '''

    def __init__(
        self,
        pep_seq='',
        protein_matches=None,
        modifications=None,
    ):
        '''
        Parameters
        ----------
        pep_seq : str
        protein_matches : list of :class:`.ProteinMatch`
        modifications : :class:`.modification.Modifications`, optional
        '''
        if protein_matches is None:
            protein_matches = ()

        self.pep_seq = pep_seq
        self.protein_matches = tuple(sorted(protein_matches))
        self.modifications = modifications

        # Lazily computed caches for the is_labeled / is_underlabeled
        # properties (None means "not computed yet").
        self._is_labeled = None
        self._is_underlabeled = None

    def to_tuple(self):
        '''
        Returns the fields used for hashing and as part of equality.

        Returns
        -------
        tuple
            ``(upper-cased pep_seq, modifications)``
        '''
        return (
            self.pep_seq.upper(),
            self.modifications,
        )

    def __hash__(self):
        return hash(self.to_tuple())

    def __eq__(self, other):
        # In case of searching just by sequence: a string matches if it
        # equals any of the supported string renderings of this peptide.
        if isinstance(other, str):
            return other in (
                self.__str__(),
                self.__str__(show_mods=True),
                self.__str__(skip_labels=False, skip_terminus=False),
                self.__str__(
                    skip_labels=False,
                    skip_terminus=False,
                    show_mods=True,
                ),
            )

        # Returning NotImplemented (rather than raising) keeps ``==``, ``!=``
        # and ``in`` usable against None or objects of other types.
        if not isinstance(other, Sequence):
            return NotImplemented

        if self.pep_seq.upper() != other.pep_seq.upper():
            return False

        if tuple(self.protein_matches) != tuple(other.protein_matches):
            return False

        return self.to_tuple() == other.to_tuple()

    def __lt__(self, other):
        if not isinstance(other, Sequence):
            return NotImplemented

        return self.pep_seq < other.pep_seq

    @staticmethod
    def _unlabeled_mods(seq):
        # Non-label modifications of *seq* as a list; empty when the
        # sequence carries no modifications object at all.
        if seq.modifications is None:
            return []

        return list(seq.modifications.skip_labels())

    def __contains__(self, other):
        '''
        Checks whether a string or another sequence is contained in this one.

        Parameters
        ----------
        other : str or :class:`.Sequence`

        Returns
        -------
        bool

        Raises
        ------
        TypeError
            If ``other`` is neither a ``str`` nor a :class:`.Sequence`.
        '''
        if isinstance(other, str):
            return any(
                other in rendered
                for rendered in (
                    self.__str__(),
                    self.__str__(skip_labels=True, skip_terminus=False),
                )
            )

        if not isinstance(other, Sequence):
            raise TypeError(type(other))

        self_mods = self._unlabeled_mods(self)
        other_mods = self._unlabeled_mods(other)

        return (
            other.pep_seq.upper() in self.pep_seq.upper() and
            len(other.protein_matches) == len(self.protein_matches) and
            all(
                i.protein == j.protein
                for i, j in zip(other.protein_matches, self.protein_matches)
            ) and
            len(other_mods) == len(self_mods) and
            all(
                i.mod_type == j.mod_type and
                i.abs_pos == j.abs_pos and
                i.nterm == j.nterm and
                i.cterm == j.cterm
                for i, j in zip(
                    other_mods,
                    self_mods,
                )
            )
        )

    def __str__(
        self,
        skip_labels=True,
        skip_terminus=True,
        mods=None,
        show_mods=False,
    ):
        '''
        Converts a peptide into a string.

        Parameters
        ----------
        skip_labels : bool, optional
            Don't include TMT/iTRAQ quantification tags in string.
        skip_terminus : bool, optional
            Don't show the 'N-' / '-C' terminus markers.
        mods : list of str, optional
            Only show this subset of modifications (i.e. ['Phospho',
            'Oxidation']).
        show_mods : bool, optional
            If true, show modification identities (i.e. 'y(Phospho)').
            Otherwise residues with modifications are shown as lowercase.

        Returns
        -------
        str
        '''
        letters = list('N-' + self.pep_seq.upper() + '-C')

        if skip_terminus:
            letters = letters[2:-2]

        shown = self.modifications

        if shown is None:
            # A sequence built without modifications renders as plain text.
            shown = ()
        else:
            if mods:
                shown = shown.get_mods(mods)

            if skip_labels:
                shown = shown.skip_labels()

        # Materialize once: the collection is scanned for every letter.
        shown = tuple(shown)
        seq_len = len(self.pep_seq)

        def _render(index, letter):
            # Indices < 0 are the 'N-' prefix, indices >= seq_len are the
            # '-C' suffix; everything in between is a residue position.
            hits = [
                mod
                for mod in shown
                if (mod.rel_pos == index)
                or (mod.nterm and index < 0)
                or (mod.cterm and index > seq_len)
            ]

            if not hits or letter == '-':
                return letter

            if not show_mods:
                return letter.lower()

            return (
                letter.lower() +
                '({})'.format(', '.join(mod.mod_type for mod in hits))
            )

        return ''.join(
            _render(index, letter)
            for index, letter in enumerate(
                letters,
                start=0 if skip_terminus else -2,
            )
        )

    def __len__(self):
        return len(self.pep_seq)

    @property
    def is_labeled(self):
        '''
        Checks whether a sequence is modified on any residue with a
        quantification label.

        The result is cached after the first access.

        Returns
        -------
        is_labeled : bool
        '''
        if self._is_labeled is None:
            self._is_labeled = (
                self.modifications is not None and
                any(
                    j.mod_type in modification.LABEL_NAMES
                    for j in self.modifications.mods
                )
            )

        return self._is_labeled

    @property
    def is_underlabeled(self):
        '''
        Checks whether a sequence is modified with quantification labels on
        fewer than all expected residues.

        A labeled sequence counts as underlabeled when it has no N-terminal
        label, or when the number of labeled (non-N-terminal) lysines differs
        from the number of 'K' residues in the sequence. The result is cached
        after the first access.

        Returns
        -------
        is_underlabeled : bool
        '''
        if self._is_underlabeled is not None:
            return self._is_underlabeled

        underlabeled = False

        if self.is_labeled:
            # XXX: Hardcodes label modification locations, not extendable to
            # new quantification tags without changes to this function
            label_mods = [
                j
                for j in self.modifications.mods
                if j.mod_type in modification.LABEL_NAMES
            ]

            if not any(j.nterm for j in label_mods):
                underlabeled = True
            else:
                # NOTE(review): 'K' is counted case-sensitively here, unlike
                # the upper-cased comparisons used elsewhere -- confirm
                # pep_seq / mod.letter are always upper case.
                labeled_lysines = sum(
                    1
                    for j in label_mods
                    if j.letter == 'K' and not j.nterm
                )
                underlabeled = self.pep_seq.count('K') != labeled_lysines

        self._is_underlabeled = underlabeled

        return underlabeled