Source code for pyproteome.data_sets.constand

'''
This module provides functionality for manipulating proteomics data sets.

Functionality includes merging data sets and interfacing with attributes in a
structured format.
'''

# Built-ins
from __future__ import absolute_import, division

from pyproteome import levels

import logging

# Core data analysis libraries
import numpy as np


LOGGER = logging.getLogger('pyproteome.constand')
CONSTAND_METHODS = {
    'mean': np.nanmean,
    'median': np.nanmedian,

    # Fit a gaussian function to each row/column and select its max point
    'kde': lambda k, **kw: np.apply_along_axis(levels.kde_max, kw['axis'], k),
}
'''
Methods to estimate the center of a data set's rows or columns.

One of ['mean', 'median', 'kde']. 'mean': applies :func:`numpy.nanmean`,
'median' applies :func:`numpy.nanmedian`, and 'kde' applies
:func:`.levels.kde_max`.
'''

CONSTAND_ERR_METHODS = {
    'mean': lambda x, **kw: np.sqrt(abs(np.nanmean(x, **kw) - 1)).sum() / 2,
    'median': lambda x, **kw: np.sqrt(abs(np.nanmedian(x, **kw) - 1)).sum() / 2,
    'kde': lambda x, **kw: np.sqrt(abs(np.apply_along_axis(levels.kde_max, kw['axis'], x) - 1)).sum() / 2,
}
DEFAULT_CONSTAND_ROW = 'mean'
DEFAULT_CONSTAND_COL = 'median'


[docs]def constand(
    ds,
    inplace=False,
    n_iters=25, 
    tol=1e-5,
    row_method=None,
    col_method=None,
):
    '''
    Normalize channels for intra-run comparisons. Iteratively fits the matrix
    of quantification values such that each row and column are centered around
    a calculated value. Uses row means and column median values for centering
    by default. See `constand.CONSTAND_METHODS` for other options.

    Parameters
    ----------
    ds : :class:`.data_sets.DataSet`
        Data set to apply CONSTANd normalization on.
    inplace : bool, optional
        Modify this data set in place.
    n_iters : int, optional
        Max number of normalization iterations. Rows are normalized on
        the odd step and columns are normalized on the even step.
    tol : float, optional
        Minimum error tolerance to use to end iterations early.
    row_method : str, optional
        Row normalization method to use. Default value is 'mean'.
    col_method : str, optional
        Column normalization method to use. Default value is 'median'.

    Returns
    -------
    ds : :class:`.data_sets.DataSet`
    '''
    new = ds

    if row_method is None:
        row_method = DEFAULT_CONSTAND_ROW

    if col_method is None:
        col_method = DEFAULT_CONSTAND_COL

    if not inplace:
        new = new.copy()

    channels = list(new.channels.values())

    new_channels = new.channels

    k = new[channels].values
    err = np.inf
    
    row_fn = lambda x: CONSTAND_METHODS[row_method](x, axis=1)
    col_fn = lambda x: CONSTAND_METHODS[col_method](x, axis=0)

    row_err_fn = lambda x: CONSTAND_ERR_METHODS[col_method](x, axis=0)
    col_err_fn = lambda x: CONSTAND_ERR_METHODS[row_method](x, axis=1)

    for ind in range(1, n_iters + 1):
        if ind % 2 == 1:
            # In the odd step the rows are fitted to match the row marginals
            # (i.e. constraints):
            #
            # K^(2t + 1) = R^(t + 1) * K ^ (2t)
            #
            # The row multipliers R^(t + 1) are computed such that the mean
            # of the reporter ion intensities equals 1:
            r = 1 / row_fn(k)

            # k^(2t + 1)
            k = np.einsum('..., ...', r, k.T).T
            err = row_err_fn(k)
        else:
            # In the even step the columns are fitted to match the column
            # marginals (i.e. constraints):
            #
            # K^(2t + 2) = K^(2t + 1) * S^(t + 1)
            #
            # The column multipliers S^(t + 1) are computed such that the
            # mean of the reporter ion intensities equals 1:
            s = 1 / col_fn(k)

            # k^(2t + 2)
            k = np.einsum('..., ...', k, s)
            err = col_err_fn(k)

        # print(ind, err)

        if err < tol:
            break

    LOGGER.info(
        '{}Applied CONSTANd normalization: iters = {}, err = {:.2e}.'.format(
            new.name + ': ' if new.name else '',
            ind,
            err,
        )
    )

    for i, (key, norm_key) in enumerate(zip(
        new.channels.values(),
        new_channels.values(),
    )):
        new.psms[norm_key] = k[:, i]

        if key != norm_key:
            del new.psms[key]

    new.channels = new_channels
    new.groups = new.groups.copy()

    new.update_group_changes()

    return new