# Source code for pyproteome.loading

'''
This module provides functionality for loading data sets.

Functionality includes loading CAMV and Proteome Discoverer data sets.
'''

# Built-ins
import logging

# Core data analysis libraries
import numpy as np

from . import camv, discoverer


# Module-level logger used by the loading helpers below for status messages.
LOGGER = logging.getLogger('pyproteome.loading')


def _scan_sequence_match(table, row):
    '''
    Return a boolean mask over `table` marking the entries whose 'Scan' and
    'Sequence' values both equal those of `row`.
    '''
    return np.logical_and(
        # Assuming Scan always == Last Scan
        table['Scan'] == row['Scan'],
        table['Sequence'] == row['Sequence'],
    )


def _calculate_rejected(psms, accepted, maybed, rejected):
    '''
    Remove PSMs that were rejected during validation and flag accepted ones.

    Each row of `psms` is handled by the first rule that applies:

    1. Its scan and sequence both appear in `rejected`: the row is removed.
    2. Its scan and sequence both appear in `accepted`: the row is kept and
       its 'Validated' value is set to True.
    3. Its scan and sequence both appear in `maybed`: the row is kept as is.
    4. Its scan (with any sequence) appears in `rejected`: the row is removed.
    5. Otherwise the row is kept as is.

    Parameters
    ----------
    psms : :class:`pandas.DataFrame`
        PSMs with 'Scan', 'Sequence' and 'Validated' columns. Its 'Validated'
        column is overwritten in place before the filtered copy is made.
    accepted, maybed, rejected : table or None
        Validation tables indexable by 'Scan' and 'Sequence' (presumably
        :class:`pandas.DataFrame` objects from CAMV -- confirm against
        ``camv.load_camv_validation``). None means no data of that kind.

    Returns
    -------
    psms : :class:`pandas.DataFrame`
        The input object itself if `rejected` is None, otherwise the
        surviving rows with a fresh 0..n-1 index.
    '''
    if rejected is None:
        return psms

    LOGGER.info('Filtering out rejected scans.')

    reject_mask = np.zeros(psms.shape[0], dtype=bool)
    validations = psms['Validated'].copy()

    # Rows are tracked by position rather than by index label: `reject_mask`
    # is a plain NumPy array, so labels only line up with it when `psms`
    # happens to carry a default 0..n-1 index.
    for pos, (_, row) in enumerate(psms.iterrows()):
        # Check if this specific sequence and scan was rejected
        if _scan_sequence_match(rejected, row).any():
            reject_mask[pos] = True
            continue

        if accepted is not None and _scan_sequence_match(accepted, row).any():
            validations.iat[pos] = True
            continue

        if maybed is not None and _scan_sequence_match(maybed, row).any():
            continue

        # Check if this scan was rejected and no sequences were accepted
        if (rejected['Scan'] == row['Scan']).any():
            reject_mask[pos] = True

    psms['Validated'] = validations
    psms = psms[~reject_mask].reset_index(drop=True)

    return psms


def _calculate_accepted(psms, accepted):
    '''
    Remove PSMs that conflict with accepted hits and flag the accepted ones.

    A row of `psms` is removed when `accepted` contains an entry with the
    same scan but a different sequence. A row is marked as validated when
    `accepted` contains an entry with the same scan and the same sequence.

    Parameters
    ----------
    psms : :class:`pandas.DataFrame`
        PSMs with 'Scan', 'Sequence' and 'Validated' columns. Its 'Validated'
        column is overwritten in place before the filtered copy is made.
    accepted : table or None
        Accepted hits, indexable by 'Scan' and 'Sequence' (presumably a
        :class:`pandas.DataFrame` from CAMV -- confirm against
        ``camv.load_camv_validation``). None means no accepted list exists.

    Returns
    -------
    psms : :class:`pandas.DataFrame`
        The input object itself if `accepted` is None, otherwise the
        surviving rows with a fresh 0..n-1 index.
    '''
    if accepted is None:
        return psms

    LOGGER.info('Filtering out non-accepted scans.')

    reject_mask = np.zeros(psms.shape[0], dtype=bool)
    validations = psms['Validated'].copy()

    # Rows are tracked by position rather than by index label: `reject_mask`
    # is a plain NumPy array, so labels only line up with it when `psms`
    # happens to carry a default 0..n-1 index.
    for pos, (_, row) in enumerate(psms.iterrows()):
        same_scan = accepted['Scan'] == row['Scan']

        # Reject hits where the scan number is the same but the sequence
        # is different.
        other_sequence = np.logical_and(
            same_scan,
            accepted['Sequence'] != row['Sequence'],
        )
        if other_sequence.any():
            reject_mask[pos] = True

        # Mark hits as validated where both the scan number and the
        # sequence match an accepted entry.
        same_sequence = np.logical_and(
            same_scan,
            accepted['Sequence'] == row['Sequence'],
        )
        if same_sequence.any():
            validations.iat[pos] = True

    psms['Validated'] = validations
    psms = psms[~reject_mask].reset_index(drop=True)

    return psms


def load_psms(basename, pick_best_psm=True):
    '''
    Load a list of peptide-spectrum matches (PSMs) from a .msf file produced by
    Proteome Discoverer.

    Any CAMV validation data available for the data set is applied afterwards
    to remove rejected hits and to flag accepted hits as validated.

    Parameters
    ----------
    basename : str
        Base name of the data set (i.e. 'CK-H1-pY' for 'CK-H1-pY.msf').
    pick_best_psm : bool, optional
        Select the best scoring PSM for a given scan, otherwise load all PSMs.
        Only honored when no CAMV validation data was loaded.

    Returns
    -------
    psms : :class:`pandas.DataFrame`
        The PSMs left after the CAMV filters were applied.
    species
        Second value returned by ``discoverer.read_discoverer_msf``, passed
        through unchanged.
    lst : tuple
        The ``(accepted, maybed, rejected)`` values returned by
        ``camv.load_camv_validation``.
    '''
    # Load CAMV data first, so unwanted hits can be cleared if it is available.
    accepted, maybed, rejected = camv.load_camv_validation(basename)
    lst = (accepted, maybed, rejected)

    # None is the "no validation data" marker used by the filters below.
    # Test for it explicitly: the truth value of a pandas.DataFrame is
    # ambiguous and `not frame` raises ValueError.
    # NOTE(review): assumes camv.load_camv_validation returns DataFrames or
    # None -- confirm.
    no_validation = all(i is None for i in lst)

    psms, species = discoverer.read_discoverer_msf(
        basename,
        # With validation data present every PSM is loaded -- presumably so
        # the validated sequence for a scan is available even when it is not
        # the best scoring one.
        pick_best_psm=pick_best_psm and no_validation,
    )

    psms = _calculate_rejected(psms, accepted, maybed, rejected)
    psms = _calculate_accepted(psms, accepted)

    return psms, species, lst