'''
This module provides functionality for normalizing protein data.
Levels can be extracted from supernatant or phosphotyrosine runs using median
or mean peptide levels across multiple channels.
'''
from __future__ import absolute_import, division
# Built-ins
from collections import OrderedDict
import logging
import warnings
# Core data analysis libraries
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
from scipy import stats
LOGGER = logging.getLogger('pyproteome.levels')
WARN_PEP_CUTOFF = 50
REL_CUTOFF = 5
LEVELS_DPI = 90
[docs]def kde_max(points):
'''
Estimate the center of a quantification channel by fitting a gaussian
KDE function and finding its maximum.
Parameters
----------
points : list of float
Returns
-------
float
'''
points = points[~np.isnan(points)]
gaus = stats.kde.gaussian_kde(points, bw_method='silverman')
# .25 - .75 1000 slices
# x = np.arange(0, 10, .01)
x = np.linspace(np.quantile(points, .15), np.quantile(points, .85), 1000)
y = np.array(gaus.pdf(x))
return x[y == y.max()][0]
[docs]def get_channel_levels(
data,
norm_channels=None,
method='median',
cols=2,
):
'''
Calculate channel normalization levels. This value is calculated by
selecting the peak of Gaussian KDE distribution fitted to channel ratio
values.
Parameters
----------
data : :class:`pyproteome.data_sets.DataSet`
norm_channels : list of str, optional
Sample names of channels to use for normalization.
method : str, optional
Normalize to the 'mean' or 'median' of each row.
cols : int, optional
Number of columns used when displaying KDE distributions.
Returns
-------
fig : :class:`matplotlib.figure.Figure`
channel_levels : dict of str, float
'''
if norm_channels is None:
norm_channels = list(data.channels.keys())
channels = [data.channels[i] for i in norm_channels]
channel_levels = OrderedDict()
rows = int(np.ceil(len(data.channels) / cols))
scale = 3
f, axes = plt.subplots(
rows, cols,
sharex=True,
sharey=True,
figsize=(scale * cols, scale * rows),
dpi=LEVELS_DPI,
)
axes = [i for j in axes for i in j]
ax_iter = iter(axes)
if method in ['mean']:
norm = data.psms[channels].mean(axis=1)
elif method in ['median']:
norm = data.psms[channels].median(axis=1)
else:
raise Exception('Unknown normalization method: {}'.format(method))
for col_name, col in zip(norm_channels, channels):
points = (data.psms[col] / norm).dropna()
# Remove peptides that change more than 25x
points = points[
(points >= 1 / REL_CUTOFF) &
(points <= REL_CUTOFF)
]
if points.shape[0] < WARN_PEP_CUTOFF:
LOGGER.warning(
(
'{}: Too few peptides for normalization, '
'quantification may be inaccurate '
' ({} peptides for {}: {})'
).format(data.name, points.shape[0], col_name, col)
)
if points.shape[0] < 1:
channel_levels[col] = 1
continue
else:
# Fit a guassian and find its maximum
max_x = kde_max(points)
channel_levels[col] = max_x
ax = next(ax_iter)
# seaborn==0.9.0 throws a scipy.stats warning
with warnings.catch_warnings():
warnings.filterwarnings(
'ignore',
'',
FutureWarning,
)
sns.distplot(
points.apply(np.log2),
bins=25,
ax=ax,
)
ax.set_xlim(left=-2, right=2)
ax.set_title(
'{} ({})'.format(col_name, col)
if isinstance(data.channels, dict) else
col,
)
txt = 'center = {:.2f}\n$\\sigma$ = {:.2f}'.format(
max_x,
points.std(ddof=1),
)
lines = [
ax.axvline(np.log2(points.median()), color='g', zorder=2, linestyle=':'),
ax.axvline(np.log2(points.mean()), color='r', zorder=3, linestyle='-.'),
ax.axvline(np.log2(max_x), color='c', zorder=4, linestyle='--'),
ax.axvline(np.log2(1), color='k', linestyle='-', zorder=0, lw=2),
]
if col_name is norm_channels[0]:
ax.legend(lines[:-1], ['kde max', 'mean', 'median'])
ax.text(
s=txt,
x=ax.get_xlim()[1] * .9,
y=1,
fontsize=10,
color='k',
horizontalalignment='right',
verticalalignment='center',
).set_bbox(
dict(
alpha=.75,
linewidth=0.25,
facecolor='white',
zorder=1,
edgecolor='black',
boxstyle='round',
)
)
for ax in ax_iter:
ax.axis('off')
f.suptitle(
'{}'.format(data.name),
fontsize=16,
y=.92,
)
return f, channel_levels