Source code for pyproteome.cluster.clusterer


from scipy.stats import zscore

import numpy as np
import pandas as pd
import sklearn
import sklearn.decomposition
import sklearn.cluster

import pyproteome as pyp


[docs]def get_data(ds, dropna=True, corrcoef=True, groups=None): ''' Extract the exact data matrix that will be used for clustering Parameters ---------- ds : :class:`pyproteome.data_sets.DataSet` dropna : bool, optional corrcoef : bool, optional groups : list of str, optional Returns ------- dict ''' ds = ds.copy() if groups is None: groups = ds.cmp_groups or [list(ds.groups.keys())] if dropna: ds = ds.dropna(how='any', groups=pyp.utils.flatten_list(groups)) names = [ chan for lst in groups for group in lst for chan in ds.groups[group] if chan in ds.channels ] chans = [ds.channels[chan] for chan in names] data = ds.data[chans] if corrcoef: c = np.corrcoef(data.values) z = zscore(data, axis=1) classes = np.array([ [ lst.index(i) for lst in groups for i in lst if col in ds.groups[i] ][0] for col in names ]) return { 'ds': ds, 'data': data, 'z': z, 'c': c if corrcoef else None, 'names': names, 'labels': groups, 'classes': classes, }
[docs]def cluster(data, z=False, log2=True, clr=None, n_clusters=20): ''' Cluster a data set using scikit-learn. Parameters ---------- data : dict Object returned from :func:`.get_data`. z : float, optional log2 : float, optional clr : object, optional Clusterer object, by default use :class:`sklearn.cluster.MiniBatchKMeans`. n_clusters : int, optional Returns ------- clr : :class:`sklearn.base.ClusterMixin` y_pred : :class:`pandas.Series` of int ''' if z: x = data['z'] else: if log2: x = data['ds'].data.applymap(np.log2) else: x = data['ds'].data if clr is None: clr = sklearn.cluster.MiniBatchKMeans(n_clusters=n_clusters) y_pred_spec = clr.fit_predict(x) y_pred = pd.Series( [ i if i >= 0 else max(y_pred_spec) + 1 for i in y_pred_spec ], index=data['ds'].psms.index, ) return clr, y_pred