Source code for pyproteome.cluster.plot


from __future__ import absolute_import, division

from functools import cmp_to_key
import logging
from math import ceil, sqrt

from fastcluster import linkage
from matplotlib import pyplot as plt
import numpy as np
from scipy.cluster.hierarchy import dendrogram
import sklearn

import pyproteome as pyp

from . import clusterer

LOGGER = logging.getLogger('pyp.cluster.plot')
COLOR_MAP = plt.cm.rainbow
CORR_COLOR_MAP = plt.cm.coolwarm


[docs]def hierarchical_clusters(data, y_pred): ds = data['ds'] z = data['z'] ss = sorted( set(y_pred), key=cmp_to_key ( lambda x, y: np.corrcoef( ds[y_pred == x].mean(axis=0), ds[y_pred == y].mean(axis=0), )[0, 1] ) ) mapping = { val: key for key, val in enumerate(ss) } link = linkage( np.array([ z[y_pred.values == x].mean(axis=0) for x in ss ]) ) leaves = dendrogram( link, labels=[str(x) for x in ss], no_plot=True, )['leaves'] mapping = { val: key for key, val in enumerate(leaves) } new_ind = np.argsort(np.vectorize(lambda x: mapping[x])(y_pred)) cn = np.corrcoef(data['data'].values[new_ind]) cmapping = { cluster_n: index for index, cluster_n in enumerate( sorted(mapping, key=lambda x: mapping[x]) ) } return mapping, new_ind, cmapping, cn
[docs]def cluster_corrmap( data, y_pred, colorbar=True, f=None, ax=None, div_scale=None, show_names=None, ): if ax is None: f, ax = plt.subplots(figsize=(13, 12)) elif f is None: f = ax.get_figure() if div_scale is None: div_scale = max([data['data'].shape[0] // 1000, 1]) if len(set(y_pred)) > 1: mapping, new_ind, _, cn = hierarchical_clusters(data, y_pred) sorted_clusters = sorted(mapping, key=lambda x: mapping[x]) y_pred = y_pred.values[new_ind] else: sorted_clusters = sorted(set(y_pred)) cn = data['c'] if div_scale > 1: cn = cn[::div_scale, ::div_scale] mesh = ax.pcolormesh( cn, cmap=CORR_COLOR_MAP, vmin=-1, vmax=1, ) if show_names: sorted_clusters = [ i for i in sorted_clusters if i in show_names ] for cluster_n in sorted_clusters: ind = np.arange( 0, data['data'].shape[0], )[y_pred == cluster_n] xy = np.median(ind) / div_scale if show_names and isinstance(show_names, dict): cluster_n = show_names.get(cluster_n) ax.text( s=str(cluster_n), x=xy, y=xy, # fontsize=sqrt(ind.shape[0]), color='k', ha='center', va='center', ) if colorbar: f.colorbar(mesh) return f
[docs]def plot_cluster( data, y_pred, cluster_n, f=None, ax=None, div_scale=None, ylabel=True, title=None, color=None, ): if ax is None: f, ax = plt.subplots( dpi=pyp.DEFAULT_DPI, figsize=(8, 4), ) elif f is None: f = ax.get_figure() z = data['z'] names = data['names'] dad = z[y_pred.values == cluster_n] n = dad.shape[0] if div_scale is None: div_scale = max([dad.shape[0] // 100, 1]) if div_scale > 1: dad = dad[::div_scale] prev_ind = 0 means = np.mean(dad, axis=0) for ind, v in enumerate(data['classes']): n_v, p_v = None, None if ind > 0: p_v = data['classes'][ind - 1] if ind + 1 < len(data['classes']): n_v = data['classes'][ind + 1] if v != p_v and p_v is not None: ax.axvline( x=ind - .5, color='k', linestyle=':', ) if v == n_v: continue ax.plot( np.tile( np.arange(prev_ind, ind + 1), (dad.shape[0], 1), ).T, dad[:, prev_ind:ind + 1].T, alpha=min([5 / dad.shape[0], 1]), color=color or COLOR_MAP(cluster_n / len(set(y_pred))), ) ax.plot( range(prev_ind, ind + 1), means[prev_ind:ind + 1], color='k', ) prev_ind = ind + 1 if ylabel: ax.set_ylabel('Z-scored Change') ax.set_facecolor((.9,) * 3) ax.set_xticks(range(dad.shape[1])) ax.set_xticklabels(names, rotation=45, horizontalalignment='right') ax.set_title( 'Cluster #{}, N={}'.format(cluster_n, n) if title is None else title ) return ax
[docs]def plot_all_clusters( data, y_pred, cols=4, ): ss = sorted(set(y_pred)) rows = int(np.ceil(len(ss) / cols)) f, axes = plt.subplots( rows, cols, figsize=(cols * 3, rows * 3), sharey=True, sharex=True, ) ax_iter = iter(axes.ravel()) for ind, (cluster_n, ax) in enumerate(zip(ss, ax_iter)): plot_cluster( data, y_pred, cluster_n, ax=ax, ylabel=ind % cols == 0, ) for ax in ax_iter: ax.axis('off') f.tight_layout(h_pad=1) return f
[docs]def show_cluster( data, y_pred, seq=None, protein=None, ylabel=True, f=None, ax=None, color=None, div_scale=None, ): if ax is None: f, ax = plt.subplots(figsize=(6, 6)) elif f is None: f = ax.get_figure() ds = data['ds'] z = data['z'] dp = ds.copy() mod = '' mask = None if seq is not None: mask = dp['Sequence'] == seq mod = dp[mask].iloc[0]['Modifications'] protein = ' / '.join(dp.filter(series=mask).genes) cluster = y_pred[mask].iloc[0] elif protein is not None: mask = dp['Proteins'] == protein mod = dp[mask].iloc[0]['Modifications'] protein = ' / '.join(dp.filter(series=mask).genes) cluster = y_pred[mask].iloc[0] mod_str = mod.get_mods([(None, 'Phospho')]).__str__(prot_index=0) title = '{}{}'.format( (protein[:15] + ' ...') if len(protein) > 15 else protein, ' {}'.format(mod_str) if mod_str else '', ) plot_cluster( data, y_pred, cluster, ax=ax, color=color, div_scale=div_scale, ylabel=ylabel, title=title, ) if mask is not None: z_m = z[mask] prev_ind = 0 for ind, v in enumerate(data['classes']): n_v = None if ind + 1 < len(data['classes']): n_v = data['classes'][ind + 1] if v == n_v: continue ax.plot( np.tile( np.arange(prev_ind, ind + 1), (z_m.shape[0], 1), ).T, z_m[:, prev_ind:ind + 1].T, color='k', linestyle=':', linewidth=5, ) prev_ind = ind + 1 dp.psms = dp[y_pred == cluster] return dp, f, ax
[docs]def show_peptide_clusters( data, y_pred, filters, new_colors=False, div_scale=None, cols=4, ): rows = int(ceil(len(filters) / cols)) f, axes = plt.subplots( rows, cols, figsize=(cols * 2, rows * 2), sharex=True, sharey=True, ) ax_iter = iter(axes.ravel()) clusters = [ sorted( set( y_pred[ data['ds']['Sequence'] == fil['seq'] if 'seq' in fil else data['ds']['Proteins'] == fil['protein'] ] ) ) for fil in filters ] clusters = [ i[0] for i in clusters if i ] ss = sorted(set(clusters)) colors = [ COLOR_MAP(ss.index(cluster_n) / len(ss)) if new_colors else COLOR_MAP(cluster_n / len(set(y_pred))) for cluster_n in clusters ] for ind, (fil, color, ax) in enumerate(zip(filters, colors, ax_iter)): show_cluster( data, y_pred, ax=ax, ylabel=ind % cols == 0, color=color, div_scale=div_scale, save=False, **fil ) for ax in ax_iter: ax.axis('off') return f, axes
[docs]def pca(data): classes = data['classes'] z = data['z'] names = data['names'] f, ax = plt.subplots(1, 1, figsize=(4, 4)) x = sklearn.decomposition.PCA().fit_transform(z.T) for ind, label in enumerate(data['labels']): ax.scatter( x[classes == ind, 0], x[classes == ind, 1], label=label, ) offset = 0 for ind, column in enumerate(names): ax.text( x=x[ind, 0] + offset, y=x[ind, 1] + offset, s=column, ) ax.legend() ax.set_title('PCA {}'.format(data['ds'].name)) ax.set_xlabel('Component 1') ax.set_ylabel('Component 2') return f, ax
[docs]def cluster_range( data, min_clusters=2, max_clusters=20, cols=3, ): clusters = range(min_clusters, max_clusters + 1) rows = int(np.ceil(len(clusters) / cols)) f, axes = plt.subplots( rows, cols, figsize=(4 * cols, 4 * rows), ) for n, ax in zip(clusters, axes.ravel()): LOGGER.info('Generating plots for cluster {}'.format(n)) _, y_pred = clusterer.cluster( data, n_clusters=n, ) pyp.cluster.plot.cluster_corrmap( data, y_pred, ax=ax, colorbar=False, ) ax.set_title('{} Clusters'.format(n)) for ax in axes.ravel()[len(clusters):]: ax.axis('off') return f, axes