Source code for pyproteome.levels

'''
This module provides functionality for normalizing protein data.

Levels can be extracted from supernatant or phosphotyrosine runs using median
or mean peptide levels across multiple channels.
'''

from __future__ import absolute_import, division

# Built-ins
from collections import OrderedDict
import logging
import warnings

# Core data analysis libraries
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
from scipy import stats

LOGGER = logging.getLogger('pyproteome.levels')
WARN_PEP_CUTOFF = 50
REL_CUTOFF = 5
LEVELS_DPI = 90


[docs]def kde_max(points):
    '''
    Estimate the center of a quantification channel by fitting a gaussian
    KDE function and finding its maximum.

    Parameters
    ----------
    points : list of float

    Returns
    -------
    float
    '''
    points = points[~np.isnan(points)]
    gaus = stats.kde.gaussian_kde(points, bw_method='silverman')
    # .25 - .75 1000 slices
    # x = np.arange(0, 10, .01)
    x = np.linspace(np.quantile(points, .15), np.quantile(points, .85), 1000)
    y = np.array(gaus.pdf(x))
    return x[y == y.max()][0]


[docs]def get_channel_levels(
    data,
    norm_channels=None,
    method='median',
    cols=2,
):
    '''
    Calculate channel normalization levels. This value is calculated by
    selecting the peak of Gaussian KDE distribution fitted to channel ratio
    values.

    Parameters
    ----------
    data : :class:`pyproteome.data_sets.DataSet`
    norm_channels : list of str, optional
        Sample names of channels to use for normalization.
    method : str, optional
        Normalize to the 'mean' or 'median' of each row.
    cols : int, optional
        Number of columns used when displaying KDE distributions.

    Returns
    -------
    fig : :class:`matplotlib.figure.Figure`
    channel_levels : dict of str, float
    '''
    if norm_channels is None:
        norm_channels = list(data.channels.keys())

    channels = [data.channels[i] for i in norm_channels]
    channel_levels = OrderedDict()

    rows = int(np.ceil(len(data.channels) / cols))
    scale = 3
    f, axes = plt.subplots(
        rows, cols,
        sharex=True,
        sharey=True,
        figsize=(scale * cols, scale * rows),
        dpi=LEVELS_DPI,
    )
    axes = [i for j in axes for i in j]
    ax_iter = iter(axes)

    if method in ['mean']:
        norm = data.psms[channels].mean(axis=1)
    elif method in ['median']:
        norm = data.psms[channels].median(axis=1)
    else:
        raise Exception('Unknown normalization method: {}'.format(method))

    for col_name, col in zip(norm_channels, channels):
        points = (data.psms[col] / norm).dropna()

        # Remove peptides that change more than 25x
        points = points[
            (points >= 1 / REL_CUTOFF) &
            (points <= REL_CUTOFF)
        ]

        if points.shape[0] < WARN_PEP_CUTOFF:
            LOGGER.warning(
                (
                    '{}: Too few peptides for normalization, '
                    'quantification may be inaccurate '
                    ' ({} peptides for {}: {})'
                ).format(data.name, points.shape[0], col_name, col)
            )

        if points.shape[0] < 1:
            channel_levels[col] = 1
            continue
        else:
            # Fit a guassian and find its maximum
            max_x = kde_max(points)

        channel_levels[col] = max_x

        ax = next(ax_iter)

        # seaborn==0.9.0 throws a scipy.stats warning
        with warnings.catch_warnings():
            warnings.filterwarnings(
                'ignore',
                '',
                FutureWarning,
            )
            sns.distplot(
                points.apply(np.log2),
                bins=25,
                ax=ax,
            )

        ax.set_xlim(left=-2, right=2)

        ax.set_title(
            '{} ({})'.format(col_name, col)
            if isinstance(data.channels, dict) else
            col,
        )

        txt = 'center = {:.2f}\n$\\sigma$ = {:.2f}'.format(
            max_x,
            points.std(ddof=1),
        )
        lines = [
            ax.axvline(np.log2(points.median()), color='g', zorder=2, linestyle=':'),
            ax.axvline(np.log2(points.mean()), color='r', zorder=3, linestyle='-.'),
            ax.axvline(np.log2(max_x), color='c', zorder=4, linestyle='--'),
            ax.axvline(np.log2(1), color='k', linestyle='-', zorder=0, lw=2),
        ]
        if col_name is norm_channels[0]:
            ax.legend(lines[:-1], ['kde max', 'mean', 'median'])

        ax.text(
            s=txt,
            x=ax.get_xlim()[1] * .9,
            y=1,
            fontsize=10,
            color='k',
            horizontalalignment='right',
            verticalalignment='center',
        ).set_bbox(
            dict(
                alpha=.75,
                linewidth=0.25,
                facecolor='white',
                zorder=1,
                edgecolor='black',
                boxstyle='round',
            )
        )

    for ax in ax_iter:
        ax.axis('off')

    f.suptitle(
        '{}'.format(data.name),
        fontsize=16,
        y=.92,
    )

    return f, channel_levels