# Source code for pyproteome.data_sets.modification

'''
This module provides functionality for post-translational modifications.

Wraps modifications in a structured class and allows filtering of
modifications by amino acid and modification type.
'''

from collections import defaultdict
import copy

LABEL_NAME_TARGETS = ('TMT', 'ITRAQ', 'plex')
'''
Substrings used to identify and import novel label names from .msf files.
'''

# Keyed by label name; each value is a set of 'K' / 'N-term' entries
# (presumably the sites the label attaches to -- confirm against callers).
# Kept as a defaultdict(set) so new label names can be added on the fly.
LABEL_NAMES = defaultdict(
    set,
    {
        'TMT10': {'K', 'N-term'},
        'TMT6': {'K', 'N-term'},
    },
)
'''
Names of modifications used for quantification of peptide abundances.
'''
MERGE_UNDERLABELED = True
'''
Merge peptides that have saturated TMT labeling with peptides that are underlabeled.
'''


class Modifications:
    '''
    A list of modifications.

    Wraps the Modification objects and provides several utility functions
    (iteration, filtering, hashing / comparison and string formatting).

    Attributes
    ----------
    mods : list of :class:`.Modification`
        The wrapped modifications. Stored as given; an empty tuple is used
        when no modifications are supplied.
    '''

    def __init__(self, mods=None):
        '''
        Initialize from a list of modifications.

        Parameters
        ----------
        mods : list of :class:`.Modification`, optional
            ``None`` (or any empty sequence) results in an empty tuple.
        '''
        self.mods = mods or ()

    def __iter__(self):
        return iter(self.mods)

    def __len__(self):
        return len(self.mods)

    def copy(self):
        '''
        Creates a copy of a set of modifications. Does not copy the underlying
        sequence object.

        Returns
        -------
        mods : :class:`.Modifications`
            A shallow copy of this object whose ``mods`` is a new tuple
            holding a ``copy()`` of every wrapped modification.
        '''
        new = copy.copy(self)
        new.mods = tuple(mod.copy() for mod in self.mods)
        return new

    def skip_labels(self):
        '''
        Get modifications, skipping over any that are peptide labels.

        A modification counts as a label when any key of ``LABEL_NAMES`` is a
        substring of its ``mod_type``.

        Returns
        -------
        mods : list of :class:`.Modification`
        '''
        return [
            mod
            for mod in self.mods
            if not any(label in mod.mod_type for label in LABEL_NAMES)
        ]

    def get_mods(self, letter_mod_types):
        '''
        Filter the list of modifications.

        Only keeps modifications with a given letter, mod_type, or both.

        Examples
        --------
        NOTE(review): the outputs shown below look stale -- this method
        returns a :class:`.Modifications` object, not a list of strings.
        Verify before running them as doctests.

        >>> from pyproteome.sequence import Sequence
        >>> from pyproteome.modification import Modification, Modifications
        >>> s = Sequence(pep_seq='SVYTEIK')
        >>> m = Modifications(
        ...     [
        ...         Modification(mod_type='TMT', nterm=True, sequence=s),
        ...         Modification(mod_type='Phospho', rel_pos=2, sequence=s),
        ...         Modification(mod_type='TMT', rel_pos=6, sequence=s),
        ...     ]
        ... )
        >>> m.get_mods('TMT')
        ['TMT A0', 'TMT K6']
        >>> m.get_mods('Phospho')
        ['pY2']
        >>> m.get_mods('Y')
        ['pY2']
        >>> m.get_mods('S')
        []
        >>> m.get_mods([('Y', 'Phospho')])
        ['pY2']
        >>> m.get_mods([('S', 'Phospho')])
        []

        Parameters
        ----------
        letter_mod_types : str or list of (str or tuple of str, str)
            A single string, or a list whose elements are either strings or
            ``(letter, mod_type)`` tuples. A one-character string is treated
            as an amino acid letter, any longer string as a mod_type.
            ``None`` keeps every modification.

        Returns
        -------
        mods : :class:`.Modifications`
        '''
        any_letter, any_mod, letter_mod = \
            _extract_letter_mods(letter_mod_types)

        return Modifications(
            tuple(
                mod
                for mod in self.mods
                if allowed_mod_type(
                    mod,
                    any_letter=any_letter,
                    any_mod=any_mod,
                    letter_mod=letter_mod,
                )
            )
        )

    @staticmethod
    def _as_sorted_tuple(mods):
        '''
        Return *mods* as a tuple ordered by each modification's to_tuple().
        '''
        return tuple(sorted(mods, key=lambda mod: mod.to_tuple()))

    def __hash__(self):
        # Labels are always excluded here, whatever MERGE_UNDERLABELED is set
        # to: two objects that are equal on their full modification lists are
        # also equal on their label-free lists, so equal objects always hash
        # equally under either setting.
        return hash(self._as_sorted_tuple(self.skip_labels()))

    def __eq__(self, other):
        '''
        Compare two sets of modifications, ignoring their order.

        When ``MERGE_UNDERLABELED`` is set, label modifications are left out
        of the comparison.

        Raises
        ------
        TypeError
            If *other* is not a :class:`.Modifications` object.
        '''
        # NOTE(review): this raises instead of returning NotImplemented, so
        # comparing against an unrelated type fails loudly. Kept as-is because
        # callers may rely on it.
        if not isinstance(other, Modifications):
            raise TypeError()

        if MERGE_UNDERLABELED:
            mine, theirs = self.skip_labels(), other.skip_labels()
        else:
            mine, theirs = self.mods, other.mods

        return self._as_sorted_tuple(mine) == self._as_sorted_tuple(theirs)

    def __lt__(self, other):
        # Ordering only looks at the non-label modifications.
        return self.skip_labels() < other.skip_labels()

    def __repr__(self, absolute=True, skip_labels=True):
        return self.__str__(absolute=absolute, skip_labels=skip_labels)

    def __str__(
        self,
        absolute=True,
        skip_labels=True,
        prot_index=None,
        show_mod_type=True,
    ):
        '''
        Format the modifications as text.

        Each modification is rendered as
        ``<mod type><letter><1-indexed position>`` with a trailing ``*`` for
        inexact protein matches. Modifications are joined by ``', '`` and the
        renderings for different protein matches by ``' / '``.

        Parameters
        ----------
        absolute : bool, optional
            Use positions within the matched protein (``abs_pos``) rather
            than within the peptide (``rel_pos``).
        skip_labels : bool, optional
            Leave out label modifications (see :func:`skip_labels`).
        prot_index : int, optional
            Only render the protein match at this index. By default all
            matches are rendered.
        show_mod_type : bool, optional
            Prefix each entry with its abbreviated modification type.

        Returns
        -------
        str
            An empty string when there is nothing to show.
        '''
        shown = self.skip_labels() if skip_labels else list(self.mods)

        if not shown:
            return ''

        def _format_for_protein(index):
            return ', '.join(
                '{}{}{}{}'.format(
                    mod.display_mod_type() if show_mod_type else '',
                    mod.letter,
                    1 + (mod.abs_pos[index] if absolute else mod.rel_pos),
                    '' if mod.exact[index] else '*',
                )
                for mod in shown
            )

        if prot_index is not None:
            return _format_for_protein(prot_index)

        # The first modification's `exact` tuple has one entry per protein
        # match, so its length gives the number of renderings to produce.
        return ' / '.join(
            _format_for_protein(index)
            for index in range(len(shown[0].exact))
        )


class Modification:
    '''
    Contains information for a single peptide modification.

    Attributes
    ----------
    rel_pos : int
        The relative position of a modification in a peptide sequence
        (0-indexed).
    mod_type : str
        A short name for this type of modification (i.e. 'Phospho',
        'Carbamidomethyl', 'Oxidation', 'TMT6', 'TMT10')
    nterm : bool
        Boolean indicator of whether this modification is applied to the
        peptide N-terminus.
    cterm : bool
        Boolean indicator of whether this modification is applied to the
        peptide C-terminus.
    sequence : object or None
        The peptide sequence this modification belongs to. Its ``pep_seq``
        and ``protein_matches`` attributes are read by the properties below.
    '''

    # Short display forms for common modification types; any mod_type not
    # listed here is displayed unchanged.
    _MOD_TYPE_ABBREVIATIONS = {
        'Phospho': 'p',
        'Carbamidomethyl': 'cm',
        'Oxidation': 'ox',
    }

    def __init__(
        self,
        rel_pos=0,
        mod_type='',
        sequence=None,
        nterm=False,
        cterm=False,
    ):
        self.rel_pos = rel_pos
        self.mod_type = mod_type
        self.nterm = nterm
        self.cterm = cterm
        self.sequence = sequence

    def display_mod_type(self):
        '''
        Return the mod_type in an abbreviated form (i.e. 'p' for 'Phospho')

        Returns
        -------
        abbrev : str
            The abbreviation, or ``mod_type`` itself when none is defined.
        '''
        return self._MOD_TYPE_ABBREVIATIONS.get(self.mod_type, self.mod_type)

    def to_tuple(self):
        '''
        Return the values that identify this modification.

        The tuple is used for hashing, equality and ordering.

        Returns
        -------
        tuple
            ``(rel_pos, mod_type, nterm, cterm, letter, abs_pos, exact)``
        '''
        return (
            self.rel_pos,
            self.mod_type,
            self.nterm,
            self.cterm,
            self.letter,
            self.abs_pos,
            self.exact,
        )

    def __hash__(self):
        return hash(self.to_tuple())

    def __lt__(self, other):
        return self.to_tuple() < other.to_tuple()

    def __eq__(self, other):
        # NOTE(review): this raises instead of returning NotImplemented, so
        # comparing against an unrelated type fails loudly. Kept as-is because
        # callers may rely on it.
        if not isinstance(other, Modification):
            raise TypeError()

        return self.to_tuple() == other.to_tuple()

    def copy(self):
        '''
        Creates a copy of a modification. Does not copy the underlying sequence
        object.

        Returns
        -------
        mod : :class:`.Modification`
        '''
        return copy.copy(self)

    @property
    def letter(self):
        '''
        This modification's one-letter amino acid code (i.e. 'Y'), or 'N-term'
        / 'C-term' for terminal modifications.

        Returns
        -------
        letter : str
            An empty string when no sequence is attached.
        '''
        if self.sequence is None:
            return ''

        if self.nterm:
            return 'N-term'

        if self.cterm:
            return 'C-term'

        return self.sequence.pep_seq[self.rel_pos].upper()

    @property
    def abs_pos(self):
        '''
        The absolute positions of this modification in the full sequence
        of each mapped protein (0-indexed).

        Returns
        -------
        tuple of int
            One entry per protein match; empty when no sequence is attached.
        '''
        if self.sequence is None:
            return ()

        return tuple(
            self.rel_pos + match.rel_pos
            for match in self.sequence.protein_matches
        )

    @property
    def exact(self):
        '''
        Indicates whether each peptide-protein mapping for this modification is
        an exact or partial match.

        Returns
        -------
        exact : tuple of bool
            One entry per protein match; empty when no sequence is attached.
        '''
        if self.sequence is None:
            return ()

        return tuple(
            match.exact
            for match in self.sequence.protein_matches
        )

    def __repr__(self):
        # The 1-indexed position is only shown for non-terminal modifications.
        return (
            '<Modification {}{}({})>'
        ).format(
            self.letter,
            (self.rel_pos + 1) if not self.cterm and not self.nterm else '',
            self.mod_type,
        )


def allowed_mod_type(mod, any_letter=None, any_mod=None, letter_mod=None):
    '''
    Check if a modification is of a given type.

    Filters by letter, mod_type, or both. A criterion left as ``None`` is
    not applied. The modification is accepted when it satisfies at least one
    of the supplied criteria, or when no criterion is supplied at all.

    Parameters
    ----------
    mod : :class:`.Modification`
    any_letter : set of str, optional
        Accept the modification if its ``letter`` is in this set.
    any_mod : set of str, optional
        Accept the modification if its ``mod_type`` is in this set.
    letter_mod : set of tuple of str, str, optional
        Accept the modification if its ``(letter, mod_type)`` pair is in this
        set.

    Returns
    -------
    is_type : bool
    '''
    # With no criteria at all there is nothing to filter on.
    if any_letter is None and any_mod is None and letter_mod is None:
        return True

    # A criterion that is None must not match by itself; otherwise a single
    # omitted argument would make every modification pass the filter.
    return (
        (
            any_letter is not None and
            mod.letter in any_letter
        ) or (
            any_mod is not None and
            mod.mod_type in any_mod
        ) or (
            letter_mod is not None and
            (mod.letter, mod.mod_type) in letter_mod
        )
    )


def _extract_letter_mods(letter_mod_types=None):
    '''
    Split a filter specification into three lookup sets.

    Parameters
    ----------
    letter_mod_types : str or iterable, optional
        A single string, or an iterable whose elements are strings or
        ``(letter, mod_type)`` tuples. In a tuple, either member may be
        ``None`` to leave that part unconstrained.

    Returns
    -------
    any_letter : set of str or None
        Upper-cased letters to match regardless of mod_type.
    any_mod : set of str or None
        Mod types to match regardless of letter.
    letter_mod : set of tuple of str, str or None
        ``(upper-cased letter, mod_type)`` pairs that must match together.

    All three are ``None`` when *letter_mod_types* is ``None``.
    '''
    if letter_mod_types is None:
        return None, None, None

    # A lone string is shorthand for a one-element specification.
    entries = (
        (letter_mod_types,)
        if isinstance(letter_mod_types, str)
        else letter_mod_types
    )

    letters, mod_types, pairs = set(), set(), set()

    for entry in entries:
        if isinstance(entry, tuple):
            residue, kind = entry

            if residue is not None and kind is not None:
                pairs.add((residue.upper(), kind))
            elif kind is not None:
                mod_types.add(kind)
            elif residue is not None:
                letters.add(residue.upper())
            else:
                raise Exception('Need at least one letter or mod type not None')
        elif len(entry) == 1:
            # One character: an amino acid letter.
            letters.add(entry.upper())
        else:
            # Anything longer: a modification type name.
            mod_types.add(entry)

    return letters, mod_types, pairs