# Source code for pyproteome.data_sets.sequence

'''
This module provides functionality for manipulating sequences.
'''

# Built-ins
import logging

from . import modification, protein

import pyproteome as pyp


# Module-level logger, registered under the fixed name 'pyproteome.sequence'.
LOGGER = logging.getLogger('pyproteome.sequence')


class ProteinMatch:
    '''
    Contains information about how a peptide sequence maps onto a protein.

    Instances are hashable and compare equal when their protein, relative
    position and exactness flag are all equal. Ordering (``<``) looks at the
    protein only.

    Attributes
    ----------
    protein : :class:`.protein.Protein`
        Protein object.
    rel_pos : int
        Relative position of the peptide start within the protein sequence.
    exact : bool
        Indicates whether a peptide sequence exactly matches its protein
        sequence.
    '''

    def __init__(self, protein, rel_pos, exact):
        self.protein = protein
        self.rel_pos = rel_pos
        self.exact = exact

    def to_tuple(self):
        '''
        Return the fields used for hashing and equality.

        Returns
        -------
        tuple
            ``(protein, rel_pos, exact)``
        '''
        return (
            self.protein,
            self.rel_pos,
            self.exact,
        )

    def __hash__(self):
        return hash(self.to_tuple())

    def __lt__(self, other):
        # Ordering deliberately ignores rel_pos / exact: matches are sorted
        # by their protein alone.
        return self.protein < other.protein

    def __eq__(self, other):
        # Returning NotImplemented (instead of raising) lets Python fall back
        # to its default handling, so comparisons against None or membership
        # tests in mixed lists evaluate to False rather than crashing.
        if not isinstance(other, ProteinMatch):
            return NotImplemented

        return self.to_tuple() == other.to_tuple()


class Sequence:
    '''
    Contains information about a sequence and which proteins it matches to.

    Attributes
    ----------
    pep_seq : str
        Peptide sequence, in 1-letter amino code.
    protein_matches : tuple of :class:`.ProteinMatch`
        Object mapping all proteins that a peptide sequence matches. Stored
        as a sorted tuple.
    modifications : :class:`.modification.Modifications`
        Object listing all post-translation modifications identified on a
        peptide. May be None until it is assigned.
    '''

    def __init__(
        self,
        pep_seq='',
        protein_matches=None,
        modifications=None,
    ):
        '''
        Parameters
        ----------
        pep_seq : str
        protein_matches : list of :class:`.ProteinMatch`
        modifications : :class:`.modification.Modifications`, optional
        '''
        if protein_matches is None:
            protein_matches = ()

        self.pep_seq = pep_seq
        self.protein_matches = tuple(sorted(protein_matches))
        self.modifications = modifications

        # Caches for the is_labeled / is_underlabeled properties; None means
        # "not computed yet".
        self._is_labeled = None
        self._is_underlabeled = None

    def to_tuple(self):
        '''
        Return the fields used for hashing.

        Returns
        -------
        tuple
            ``(upper-cased pep_seq, modifications)``
        '''
        return (
            self.pep_seq.upper(),
            self.modifications,
        )

    def __hash__(self):
        return hash(self.to_tuple())

    def __eq__(self, other):
        # In case of searching just by sequence: a string matches if it
        # equals any of the four renderings produced by __str__.
        if isinstance(other, str):
            return other in (
                self.__str__(),
                self.__str__(show_mods=True),
                self.__str__(skip_labels=False, skip_terminus=False),
                self.__str__(
                    skip_labels=False,
                    skip_terminus=False,
                    show_mods=True,
                ),
            )

        # Returning NotImplemented (instead of raising) lets Python fall back
        # to its default handling, so e.g. ``seq == None`` is simply False.
        if not isinstance(other, Sequence):
            return NotImplemented

        if self.pep_seq.upper() != other.pep_seq.upper():
            return False

        if tuple(self.protein_matches) != tuple(other.protein_matches):
            return False

        return self.to_tuple() == other.to_tuple()

    def __lt__(self, other):
        return self.pep_seq < other.pep_seq

    def _unlabeled_mods(self):
        '''
        List this sequence's modifications as returned by
        ``modifications.skip_labels()``; empty if no modifications are set.
        '''
        if self.modifications is None:
            return []

        return list(self.modifications.skip_labels())

    def __contains__(self, other):
        '''
        Support ``other in sequence`` for strings and Sequence objects.

        A string is contained if it is a substring of the default rendering
        or of the rendering that keeps the terminus markers. A Sequence is
        contained if its peptide is a substring of this one, both map to the
        same proteins (pairwise, in order) and their
        ``skip_labels()`` modifications agree pairwise on type, absolute
        position and terminus flags.

        Raises
        ------
        TypeError
            If other is neither a str nor a Sequence.
        '''
        if isinstance(other, str):
            return any(
                other in rendered
                for rendered in (
                    self.__str__(),
                    self.__str__(skip_labels=True, skip_terminus=False),
                )
            )

        if not isinstance(other, Sequence):
            raise TypeError(type(other))

        self_mods = self._unlabeled_mods()
        other_mods = other._unlabeled_mods()

        return (
            other.pep_seq.upper() in self.pep_seq.upper() and
            len(other.protein_matches) == len(self.protein_matches) and
            all(
                i.protein == j.protein
                for i, j in zip(other.protein_matches, self.protein_matches)
            ) and
            len(other_mods) == len(self_mods) and
            all(
                i.mod_type == j.mod_type and
                i.abs_pos == j.abs_pos and
                i.nterm == j.nterm and
                i.cterm == j.cterm
                for i, j in zip(
                    other_mods,
                    self_mods,
                )
            )
        )

    def __str__(
        self,
        skip_labels=True,
        skip_terminus=True,
        mods=None,
        show_mods=False,
    ):
        '''
        Converts a peptide into a string.

        A sequence without modifications set (``modifications is None``) is
        rendered as if it had no modifications.

        Parameters
        ----------
        skip_labels : bool, optional
            Don't include TMT/iTRAQ quantification tags in string.
        skip_terminus : bool, optional
            Don't show N-/C-terminal modifications.
        mods : list of str, optional
            Only show this subset of modifications
            (i.e. ['Phospho', 'Oxidation']).
        show_mods : bool, optional
            If true, show modification identities (i.e. 'y(Phospho)').
            Otherwise residues with modifications are shown as lowercase.

        Returns
        -------
        str
        '''
        string = list('N-' + self.pep_seq.upper() + '-C')

        if skip_terminus:
            string = string[2:-2]

        self_mods = self.modifications

        if self_mods is None:
            self_mods = []
        else:
            if mods:
                self_mods = self_mods.get_mods(mods)
            if skip_labels:
                self_mods = self_mods.skip_labels()

            # Materialize once: the modifications are re-scanned for every
            # character below, which would silently break if a one-shot
            # iterator were returned above.
            self_mods = list(self_mods)

        seq_len = len(self.pep_seq)

        def _mods(index, letter):
            # Indices < 0 are the 'N-' prefix and indices > len(pep_seq) are
            # the trailing 'C'; they pick up terminal modifications.
            lst = [
                mod
                for mod in self_mods
                if (mod.rel_pos == index)
                or (mod.nterm and index < 0)
                or (mod.cterm and index > seq_len)
            ]

            if not lst or letter == '-':
                return letter

            if not show_mods:
                return letter.lower()

            return (
                letter.lower() +
                '({})'.format(', '.join([mod.mod_type for mod in lst]))
            )

        # When the terminus markers are kept, start counting at -2 so that
        # index 0 still lines up with the first residue of the peptide.
        return ''.join(
            _mods(ind, letter)
            for ind, letter in enumerate(
                string,
                start=0 if skip_terminus else -2,
            )
        )

    def __len__(self):
        return len(self.pep_seq)

    @property
    def is_labeled(self):
        '''
        Checks whether a sequence is modified on any residue with a
        quantification label.

        The result is cached after the first access. Reads
        ``self.modifications.mods``, so modifications must be set.

        Returns
        -------
        is_labeled : bool
        '''
        if self._is_labeled is not None:
            return self._is_labeled

        val = any(
            j.mod_type in modification.LABEL_NAMES
            for j in self.modifications.mods
        )

        self._is_labeled = val

        return val

    @property
    def is_underlabeled(self):
        '''
        Checks whether a sequence is modified with quantification labels on
        fewer than all expected residues.

        A labeled sequence counts as underlabeled if it has no label on an
        N-terminal modification, or if the number of 'K' characters in
        pep_seq differs from the number of labels on non-N-terminal 'K'
        modifications. Unlabeled sequences are never underlabeled. The result
        is cached after the first access.

        Returns
        -------
        is_underlabeled : bool
        '''

        if self._is_underlabeled is not None:
            return self._is_underlabeled

        underlabeled = False

        if self.is_labeled:
            # XXX: Hardcodes label modification locations, not extendable to
            # new quantification tags without changes to this function
            has_nterm_label = any(
                j.mod_type in modification.LABEL_NAMES and j.nterm
                for j in self.modifications.mods
            )

            if not has_nterm_label:
                underlabeled = True
            else:
                labeled_lysines = sum(
                    j.mod_type in modification.LABEL_NAMES
                    for j in self.modifications.mods
                    if j.letter == 'K' and not j.nterm
                )
                underlabeled = self.pep_seq.count('K') != labeled_lysines

        self._is_underlabeled = underlabeled

        return underlabeled


def extract_sequence(proteins, sequence_string):
    '''
    Extract a Sequence object from a set of proteins and a sequence string.

    Does not set the Sequence.modifications attribute.

    Parameters
    ----------
    proteins : :class:`.protein.Proteins`
        Proteins the peptide is mapped onto. Any value that is not a
        :class:`.protein.Proteins` instance is treated as "no protein
        matches".
    sequence_string : str
        Peptide sequence. It is upper-cased for the protein lookup but
        stored on the returned Sequence unchanged.

    Returns
    -------
    seq : :class:`.Sequence`
        A single Sequence with one ProteinMatch per protein.
    '''
    # Skip peptides with no protein matches
    if not isinstance(proteins, protein.Proteins):
        proteins = []

    def _get_rel_pos(prot, pep_seq):
        # Returns (position, exact) for pep_seq within prot.full_sequence.
        seq = prot.full_sequence

        # No protein sequence available: report position 0, non-exact.
        if not seq:
            return 0, False

        pep_pos = seq.find(pep_seq)

        if pep_pos >= 0:
            return pep_pos, True

        # No exact substring hit; fall back to pyp.utils.fuzzy_find and use
        # whatever position it reports.
        return pyp.utils.fuzzy_find(pep_seq, seq), False

    def _match(prot):
        # Build the ProteinMatch describing where the peptide sits in prot.
        rel_pos, exact = _get_rel_pos(prot, sequence_string.upper())

        return ProteinMatch(
            protein=prot,
            rel_pos=rel_pos,
            exact=exact,
        )

    return Sequence(
        pep_seq=sequence_string,
        protein_matches=[_match(prot) for prot in proteins],
    )