Source code for pyproteome.analysis.plot

'''
Plot calculated levels of a given sequence across channels or groups.
'''

from __future__ import division

# Built-ins
from collections import OrderedDict
import itertools
import logging

# Core data analysis libraries
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind

import pyproteome as pyp

LOGGER = logging.getLogger('pyproteome.plot')


[docs]def plot( data, title=None, ax=None, log_2=True, box=True, ): ''' Plot the levels of a sequence across multiple channels. Parameters ---------- data : :class:`pyproteome.data_sets.DataSet` title : str, optional figsize : tuple of (int, int), optional Returns ------- figs : list of :class:`matplotlib.figure.Figure` ''' cmp_groups = data.cmp_groups or [list(data.groups.keys())] channel_names = [ channel_name for groups in cmp_groups for group in groups for channel_name in data.groups[group] if channel_name in data.channels ] channel_names = list(OrderedDict.fromkeys(channel_names)) channels = [ data.channels[channel_name] for channel_name in channel_names ] figures = [] for _, row in data.psms.iterrows(): seq = str(row['Sequence']) values = row[channels] mask = ~pd.isnull(row[channels]) names = pd.Series(channel_names, index=values.index)[mask] values = values[mask] values = values / ( values[[ data.channels[i] for grp in cmp_groups for i in data.groups[grp[0]] if i in data.channels and data.channels[i] in values.index ]].median() if len(cmp_groups) > 0 else values[0] ) if ax is None: fig, ax_i = plt.subplots( figsize=(len(values) / 2, 6 / 2), ) else: ax_i = ax df = pd.DataFrame( [ ( name, val, [ group for groups in cmp_groups for group in groups if name in data.groups[group] ][0], ) for name, val in zip(names, values) ], columns=('name', 'val', 'group'), ) if log_2: df['val'] = df['val'].apply(np.log2) (sns.barplot if box else sns.barplot)( x='name', y='val', hue='group', data=df, ax=ax_i, dodge=False, ) ax_i.set_xticklabels( ax_i.get_xticklabels(), rotation=45, horizontalalignment='right', ) ax_i.set_xlabel('') ax_i.get_legend().set_title('') ax_i.axhline( np.log2(1) if log_2 else 1, linestyle=':', color='#4C4D4F', # alpha=.5, ) mod_str = row['Modifications'].__str__(prot_index=0) ax_i.set_title( title if title else '{} ({}{})'.format( seq, pyp.utils.get_name(row['Proteins'])[:20], (' ' + mod_str) if mod_str else '', ), ) ylabel = 'Intensity' # if data.cmp_groups: # else: # ylabel = ( # 'Cummulative Intensity' + # (' (Normalized)' if data.intra_normalized else '') # ) ax_i.set_ylabel( ylabel, ) figures.append((ax_i.get_figure(), ax_i)) return figures
def _gen_groups(cmp_groups): ret = [] for i in cmp_groups: lst = list(itertools.combinations(i, 2)) for ind, i in enumerate(lst, start=1): ret.append(i + (ind != len(lst),)) return ret
[docs]def plot_group( data, cmp_groups=None, cmp_groups_star=None, title=None, ax=None, box=True, show_p=True, show_ns=False, log_2=True, offset_frac=20, title_mods=None, size=4, y_max=None, p_ha='center', cmap='cool', linecolor='#000000', swarmcolor='#000000', ): ''' Plot the levels of a sequence across each group. Parameters ---------- data : :class:`pyproteome.data_sets.DataSet` cmp_groups : list of tuple, optional cmp_groups_star : list of tuple, optional title : str, optional ax : :class:`matplotlib.axes.Axes`, optional box : bool, optional show_p : bool, optional show_ns : bool, optional log_2 : bool, optional offset_frac : float, optional title_mods : list of str, optional size : float, optional y_max : float, optional p_ha : str, optional cmap : str, optional Returns ------- figs : list of :class:`matplotlib.figure.Figure` ''' if cmp_groups is None: cmp_groups = data.cmp_groups or [list(data.groups.keys())] figures = [] for _, row in data.psms.iterrows(): values = [] for groups in cmp_groups: groups = [i for i in groups if i in data.groups] group_vals = pd.Series([ row[[ data.channels[name] for name in data.groups[group] if name in data.channels ]] for group in groups ], index=groups, dtype=object) group_vals = pd.Series([ group[(~pd.isnull(group)) & (group > 0)] for group in group_vals ], index=group_vals.index, dtype=object) group_vals = group_vals[ group_vals.apply(lambda x: x.shape[0] > 0) ] # Check normalization group is not null and at least one other # group of channels is not null if ( group_vals.shape[0] < 1 # or ( # len(cmp_groups) > 1 # # and groups[0] not in group_vals.index # ) # or all([ # group not in group_vals.index # for group in groups[1:] # ]) ): continue normalize = group_vals.iloc[0].median() group_vals = pd.Series([ group / normalize for group in group_vals ], index=group_vals.index, dtype=object) values.append(group_vals) labels = [ name for group in values for name in group.index ] if ax is None: _, plot_ax = plt.subplots( figsize=(len(labels) * .75, 4), ) else: plot_ax = ax x = [ ind for ind, l in enumerate( j for i in values for j in i.values ) for k in l ] y = np.concatenate([ j.astype(float) for i in values for j in i.values ]) if log_2: y = np.log2(y) def _get_color(label): lst = [ i for i in cmp_groups for ind, val in enumerate(i) if val == label ][0] return sns.color_palette( cmap, len(lst), ).as_hex()[lst.index(label)] df = pd.DataFrame( [ ( k, np.log2(k), label, ) for i in values for label, j in i.iteritems() for k in j.values ], columns=('y', 'log2_y', 'label'), ) if box: kwargs = {'showfliers': False} else: kwargs = {} (sns.boxplot if box else sns.barplot)( x='label', y='log2_y' if log_2 else 'y', hue='label', palette={label: _get_color(label) for label in df['label']}, data=df, ax=plot_ax, dodge=False, linewidth=.75 if box else 0, **kwargs ) sns.swarmplot( x=x, y=y, color=swarmcolor, ax=plot_ax, size=size, ) plot_ax.axhline( np.log2(1) if log_2 else 1, linestyle='--', color='#4C4D4F', ) mod_str = row['Modifications'].get_mods(title_mods).__str__(prot_index=0) plot_ax.set_title( title if title else '{}{}({}{})'.format( row['Sequence'], ' ' if len(plot_ax.get_xticklabels()) > 2 else '\n', pyp.utils.get_name(row['Proteins'])[:20], (' ' + mod_str) if mod_str else '', ), ) plot_ax.xaxis.grid(False) if show_p: if y_max is None: y_max_cp = y.max() else: y_max_cp = y_max v = [ vals for group_vals in values for vals in group_vals ] cmp_star = cmp_groups_star if cmp_star is None: cmp_star = _gen_groups(cmp_groups) offset = y_max_cp / offset_frac / 2 for x_ind, x in enumerate(cmp_star): move_offset = True if len(x) == 4: group_a, group_b, move_offset, p_ha = x elif len(x) == 3: group_a, group_b, move_offset = x else: group_a, group_b = x if group_a not in labels or group_b not in labels: continue index_a = labels.index(group_a) index_b = labels.index(group_b) values_a = v[index_a] values_b = v[index_b] if values_a.shape[0] < 2 or values_b.shape[0] < 2: continue pval = ttest_ind( values_a.values, values_b.values, ).pvalue txt = pyp.utils.stars(pval) if txt != 'ns' or show_ns: plot_ax.annotate( '', xy=( index_a, y_max_cp + offset, ), xytext=( index_b, y_max_cp + offset, ), xycoords='data', textcoords='data', arrowprops=dict(arrowstyle='-', ec=linecolor), ) p_x = { 'left': index_a, 'center': np.mean([index_a, index_b]), 'right': index_b, }.get(p_ha, index_b) plot_ax.text( x=p_x, y=y_max_cp + offset + y_max_cp / offset_frac / 4 * (2 if txt == 'ns' else .75), s=txt, ha='center', va='center', ) if move_offset and x_ind != len(cmp_star) - 1: offset += y_max_cp / offset_frac else: y_max_cp, offset = 0, 0 plot_ax.set_ylim( bottom=plot_ax.get_ylim()[0], top=max([ plot_ax.get_ylim()[1], y_max_cp + offset + y_max_cp / offset_frac, ]), ) plot_ax.yaxis.set_ticks_position('left') plot_ax.xaxis.set_ticks_position('bottom') plot_ax.set_xlabel('') plot_ax.set_ylabel( '{} Signal'.format( 'Relative' if cmp_groups else 'Cumulative', ), ) if plot_ax.get_legend(): plot_ax.get_legend().set_visible(False) if log_2: plot_ax.set_yticks([ i for i in plot_ax.get_yticks() if i >= plot_ax.get_ylim()[0] and i <= plot_ax.get_ylim()[1] ]) plot_ax.set_yticklabels( [ '{:.2f}'.format(np.power(2, i)) for i in plot_ax.get_yticks() ], ) plot_ax.set_xticklabels( labels, rotation=45, horizontalalignment='right', ) figures.append((plot_ax.get_figure(), plot_ax)) return figures
[docs]def plot_together( data, cmp_groups=None, title=None, ax=None, show_p=True, log_2=True, cmap='cool', ): ''' Plot the levels of a sequence across each group in one shared plot. Parameters ---------- data : :class:`pyproteome.data_sets.DataSet` cmp_groups : list of tuple, optional title : str, optional ax : :class:`matplotlib.axes.Axes`, optional show_p : bool, optional log_2 : bool, optional cmap : str, optional Returns ------- figs : list of :class:`matplotlib.figure.Figure` ''' if cmp_groups is None: cmp_groups = data.cmp_groups or [list(data.groups.keys())] figures = [] for _, row in data.psms.iterrows(): values = [] for groups in cmp_groups: groups = [i for i in groups if i in data.groups] group_vals = pd.Series([ row[[ data.channels[name] for name in data.groups[group] if name in data.channels ]] for group in groups ], index=groups, dtype=object) group_vals = pd.Series([ group[~pd.isnull(group)] for group in group_vals ], index=group_vals.index, dtype=object) group_vals = group_vals[ group_vals.apply(lambda x: x.shape[0] > 0) ] # Check normalization group is not null and at least one other # group of channels is not null if ( group_vals.shape[0] < 1 or ( len(cmp_groups) > 1 and groups[0] not in group_vals.index ) or all( group not in group_vals.index for group in groups[1:] ) ): continue normalize = group_vals.iloc[0].median() group_vals = pd.Series([ group / normalize for group in group_vals ], index=group_vals.index, dtype=object) values.append(group_vals) labels = [ name for group in values for name in group.index ] if ax is None: _, plot_ax = plt.subplots( figsize=(len(labels) * .75, 4), ) else: plot_ax = ax x = [ ind for ind, l in enumerate( j for i in values for j in i.values ) for k in l ] y = np.concatenate([ j.astype(float) for i in values for j in i.values ]) if log_2: y = np.log2(y) def _get_color(label): lst = [ i for i in cmp_groups for ind, val in enumerate(i) if val == label ][0] return sns.color_palette( cmap, len(lst), ).as_hex()[lst.index(label)] df = pd.DataFrame( [ ( k, np.log2(k), label, _get_color(label), ) for i in values for label, j in i.iteritems() for k in j.values ], columns=('y', 'log2_y', 'label', 'color'), ) sns.boxplot( x='label', y='log2_y' if log_2 else 'y', hue='color', data=df, ax=plot_ax, dodge=False, boxprops=dict(alpha=.3), ) sns.swarmplot( x=x, y=y, color='k', ax=plot_ax, # size=10, ) plot_ax.axhline( np.log2(1) if log_2 else 1, linestyle='--', color='#4C4D4F', ) if title: plot_ax.set_title(title) plot_ax.xaxis.grid(False) if show_p: y_max = y.max() v = [ vals for group_vals in values for vals in group_vals ] for grp_set in cmp_groups: offset = y_max / 10 for label_a, label_b in itertools.combinations(grp_set, 2): if label_a not in labels or label_b not in labels: continue index_a = labels.index(label_a) index_b = labels.index(label_b) values_a = v[index_a] values_b = v[index_b] if values_a.shape[0] < 2 or values_b.shape[0] < 2: continue pval = ttest_ind( values_a.values, values_b.values, ).pvalue if pval < 0.05: plot_ax.annotate( '', xy=( index_a, y_max + offset, ), xytext=( index_b, y_max + offset, ), xycoords='data', textcoords='data', arrowprops=dict( arrowstyle='-', ec='#000000', ), ) plot_ax.text( x=np.mean([index_a, index_b]), y=y_max + offset + y_max / 40, s=pyp.utils.stars(pval, ns='-'), horizontalalignment='center', verticalalignment='center', ) offset += y_max / 10 plot_ax.set_ylim( bottom=plot_ax.get_ylim()[0], top=max([ plot_ax.get_ylim()[1], y_max + offset + y_max / 10, ]), ) plot_ax.set_xlabel('') plot_ax.set_ylabel( '{} Signal'.format( 'Relative' if cmp_groups else 'Cumulative', ), ) plot_ax.get_legend().set_visible(False) if log_2: plot_ax.set_yticklabels( [ '{:.2f}'.format(i) for i in np.power(2, plot_ax.get_yticks()) ], ) plot_ax.set_xticklabels( labels, rotation=45, horizontalalignment='right', ) figures.append((plot_ax.get_figure(), plot_ax)) return figures
[docs]def plot_all( data, cmp_groups=None, ): ''' Runs :func:`.plot` and :func:`.plot_group` for all peptides in a data set. Parameters ---------- data : :class:`pyproteome.data_sets.DataSet` cmp_groups : list of tuple, optional Returns ------- figs : list of :class:`matplotlib.figure.Figure` ''' figures = [] figures += plot( data, ) figures += plot_group( data, cmp_groups=cmp_groups, ) return figures