Source code for pyproteome.pathways.enrichments

# -*- coding: utf-8 -*-
'''
This module does most of the heavy lifting for the pathways module.

It includes functions for calculating enrichment scores and generating plots
for GSEA / PSEA.
'''
from __future__ import division

from collections import defaultdict
from functools import partial
import logging
import multiprocessing

import numpy as np
import pandas as pd
from sklearn.utils import shuffle

from . import plot


LOGGER = logging.getLogger('pyproteome.enrichments')
MIN_PERIODS = 5
'''
Minimum number of samples with peptide quantification and phenotypic
measurements needed to generate a correlation metric score.
'''
CORRELATION_METRICS = [
    'spearman',
    'pearson',
    'kendall',
    'fold',
    'log2',
    'zscore',
]
'''
Correlation metrics used for enrichment analysis. 'spearman', 'pearson', and
'kendall' are all calculated using `pandas.Series.corr()`.

'fold' takes ranking values direction from the 'Fold Change' column.

'log2' takes ranking values from a log2 'Fold Change' column.

'zscore' takes ranking values from a log2 z-scored 'Fold Change' column.
'''
DEFAULT_P = 0.75
DEFAULT_ESS_METHOD = 'integral'
DEFAULT_Q = 0.25
DEFAULT_RANK_CPUS = 6
'''
Default number of CPUs to use when scrambling rows of a data set.
'''
DEFAULT_CORR_CPUS = 4
'''
Default number of CPUs to use when scrambling columns of a data set.
'''
ESS_METHODS = {
    'max_abs': lambda x: max(x, key=abs),
    'max_min': lambda x: (max(x) - min(x)) * np.sign(max(x, key=abs)),
    'integral': lambda x: np.trapz(x),
}


[docs]class PrPDF(object): ''' An exact probability distribution estimator. ''' def __init__(self, data): self.data = np.sort(data)
[docs] def pdf(self, x): ''' Probability density function. Parameters ---------- x : float Returns ------- float ''' return 1 / self.data.shape[0]
[docs] def cdf(self, x): ''' Cumulative density function. Parameters ---------- x : float Returns ------- float ''' return np.searchsorted(self.data, x, side='left') / self.data.shape[0]
[docs] def sf(self, x): ''' Survival function. Parameters ---------- x : float Returns ------- float ''' return 1 - ( np.searchsorted(self.data, x, side='right') / self.data.shape[0] )
def _calc_p(ess, ess_pi): return [ abs(ess) <= abs(i) for i in ess_pi if (i < 0) == (ess < 0) ] def _shuffle(ser): ind = ser.index ser = shuffle(ser) ser.index = ind return ser
[docs]def simulate_es_s_pi( vals, psms, gene_sets, phenotype=None, metric='spearman', p_iter=1000, n_cpus=None, **kwargs ): ''' Simulate ES(S, pi) by scrambling the phenotype / correlation values for a data set and recalculating gene set enrichment scores. Parameters ---------- vals : :class:`pandas.DataFrame` psms : :class:`pandas.DataFrame` gene_sets : :class:`pandas.DataFrame` phenotype : :class:`pandas.Series`, optional metric : str, optional p_iter : int, optional n_cpus : int, optional Returns ------- df : :class:`pandas.DataFrame` ''' assert metric in CORRELATION_METRICS LOGGER.info( 'Calculating ES(S, pi) for {} gene sets'.format(len(gene_sets)) ) if n_cpus is None: n_cpus = DEFAULT_RANK_CPUS if metric in ['spearman', 'pearson', 'kendall']: n_cpus = DEFAULT_CORR_CPUS if n_cpus > 1: pool = multiprocessing.Pool( processes=n_cpus, ) gen = pool.imap_unordered( partial( _calc_essdist, psms=psms.copy(), gene_sets=gene_sets, metric=metric, **kwargs ), [phenotype for _ in range(p_iter)], ) else: gen = ( _calc_essdist( phen=phenotype, psms=psms, gene_sets=gene_sets, metric=metric, **kwargs ) for _ in range(p_iter) ) ess_dist = defaultdict(list) for ind, ess in enumerate( gen, start=1, ): for key, val in ess.items(): ess_dist[key].append(val) if ind % (p_iter // min([p_iter, 10])) == 0: LOGGER.info( '-- Calculated {}/{} pvals'.format(ind, p_iter) ) LOGGER.info('Calculating ES(S, pi) using {} cpus'.format(n_cpus)) vals['ES(S, pi)'] = vals.index.map( lambda row: ess_dist[row], ) return vals
def _calc_q(nes, nes_pdf, nes_pi_pdf): if nes > 0: return ( nes_pi_pdf.pdf(nes) + nes_pi_pdf.sf(nes) ) / ( nes_pdf.pdf(nes) + nes_pdf.sf(nes) ) else: return ( nes_pi_pdf.pdf(nes) + nes_pi_pdf.cdf(nes) ) / ( nes_pdf.pdf(nes) + nes_pdf.cdf(nes) )
[docs]def estimate_pq(vals): ''' Estimate p- and q-values for an enrichment analysis using the ES(S, pi) values generated by `simulate_es_s_pi`. Parameters ---------- vals : :class:`pandas.DataFrame` ''' assert 'ES(S)' in vals.columns assert 'ES(S, pi)' in vals.columns vals = vals.copy() mask = vals['ES(S)'] > 0 pos_ess = vals['ES(S)'][mask] neg_ess = vals['ES(S)'][~mask] pos_pi = vals['ES(S, pi)'].apply(np.array).apply(lambda x: x[x > 0]) neg_pi = vals['ES(S, pi)'].apply(np.array).apply(lambda x: x[x < 0]) pos_mean = pos_pi.apply(np.mean).abs() neg_mean = neg_pi.apply(np.mean).abs() pos_nes = pos_ess / pos_mean neg_nes = neg_ess / neg_mean # assert (pos_nes.isnull() | neg_nes.isnull()).all() # assert ((~pos_nes.isnull()) | (~neg_nes.isnull())).all() pos_pi_nes = pos_pi / pos_mean neg_pi_nes = neg_pi / neg_mean vals['NES(S)'] = pos_nes.fillna(neg_nes) vals['pos NES(S, pi)'] = pos_pi_nes vals['neg NES(S, pi)'] = neg_pi_nes pos_mat = ( np.concatenate(pos_pi_nes.values) if pos_pi_nes.shape[0] > 0 else np.array([]) ) neg_mat = ( np.concatenate(neg_pi_nes.values) if neg_pi_nes.shape[0] > 0 else np.array([]) ) plot.plot_nes_dist( vals['NES(S)'].values, np.concatenate([pos_mat, neg_mat]), ) pos_pdf = PrPDF(pos_nes.dropna().values) neg_pdf = PrPDF(neg_nes.dropna().values) pos_pi_pdf = PrPDF(pos_mat) neg_pi_pdf = PrPDF(neg_mat) LOGGER.info('Generated NES(S) distributions') if vals.shape[0] > 0: vals['p-value'] = vals.apply( lambda x: _frac_true( _calc_p(x['ES(S)'], x['ES(S, pi)']) ), axis=1, ) vals['q-value'] = vals.apply( lambda x: _calc_q( x['NES(S)'], pos_pdf if x['NES(S)'] > 0 else neg_pdf, pos_pi_pdf if x['NES(S)'] > 0 else neg_pi_pdf, ), axis=1, ) LOGGER.info('Calculated p, q values') return vals
def _frac_true(x): return sum(x) / max([len(x), 1])
[docs]def get_gene_changes(psms): ''' Extract the gene IDs and correlation values for each gene / phosphosite in a data set. Merge together duplicate IDs by calculating their mean correlation. Parameters ---------- psms : :class:`pandas.DataFrame` ''' LOGGER.info('Getting gene correlations ({} IDs)'.format(psms.shape[0])) gene_changes = psms[['ID', 'Correlation']].copy() if gene_changes.shape[0] > 0: gene_changes = gene_changes.groupby( 'ID', as_index=False, ).agg({ 'Correlation': np.mean, }) gene_changes = gene_changes.sort_values(by='Correlation', ascending=False) gene_changes = gene_changes.set_index(keys='ID') return gene_changes
def _calc_essdist( phen=None, psms=None, gene_sets=None, metric='spearman', **kwargs ): assert metric in CORRELATION_METRICS if phen is not None: phen = _shuffle(phen) if metric in ['fold', 'zscore']: psms = psms.copy() psms['Fold Change'] = _shuffle(psms['Fold Change']) vals = enrichment_scores( psms, gene_sets, phenotype=phen, metric=metric, recorrelate=True, pval=False, **kwargs ) return vals['ES(S)']
[docs]def correlate_phenotype(psms, phenotype=None, metric='spearman'): ''' Calculate the correlation values for each gene / phosphosite in a data set. Parameters ---------- psms : :class:`pandas.DataFrame` phenotype : :class:`pandas.Series`, optional metric : str, optional The correlation function to use. See CORRELATION_METRICS for a full list of choices. Returns ------- psms : :class:`pandas.DataFrame` ''' assert metric in CORRELATION_METRICS psms = psms.copy() if metric in ['spearman', 'pearson', 'kendall']: LOGGER.info( 'Calculating correlations using metric \'{}\' (samples: {})' .format(metric, list(phenotype.index)) ) psms['Correlation'] = psms.apply( lambda row: phenotype.corr( pd.to_numeric(row[phenotype.index]), method=metric, min_periods=MIN_PERIODS, ), axis=1, ) else: LOGGER.info( 'Calculating ranks' ) new = psms['Fold Change'] if ( metric in ['log2'] or (metric in ['zscore'] and (new > 0).all()) ): new = new.apply(np.log2) if metric in ['zscore']: new = (new - new.mean()) / new.std() psms['Correlation'] = new return psms
[docs]def calculate_es_s(gene_changes, gene_set, p=None, n_h=None, ess_method=None): ''' Calculate the enrichment score for an individual gene set. Parameters ---------- gene_changes : :class:`pandas.DataFrame` gene_set : set of str p : float, optional n_h : int, optional ess_method : str, optional One of {'integral', 'max_abs', 'max_min'}. Returns ------- dict ''' if p is None: p = DEFAULT_P if ess_method is None: ess_method = DEFAULT_ESS_METHOD assert ess_method in ESS_METHODS n = len(gene_changes) gene_set = set( gene for gene in gene_set if gene in gene_changes.index ) hits = gene_changes.index.isin(gene_set) hit_list = gene_changes[hits].index.tolist() if n_h is None: n_h = len(gene_set) n_r = gene_changes[hits]['Correlation'].apply( lambda x: abs(x) ** p ).sum() scores = [0] + [ ((abs(val) ** p) / n_r) if hit else (-1 / (n - n_h)) for hit, val in zip(hits, gene_changes['Correlation']) ] cumsum = np.cumsum(scores) ess = ESS_METHODS.get(ess_method)(cumsum) return { 'hits': hits, 'cumscore': cumsum, 'ess': ess, 'hit_list': hit_list, }
[docs]def calculate_es_s_ud(gene_changes, up_set, down_set, **kwargs): ''' Calculate the enrichment score for an individual gene set. Parameters ---------- gene_changes : :class:`pandas.DataFrame` gene_set : set of str kwargs : dict, optional See extra arguments passed to calculate_es_s. Returns ------- dict ''' up_set = set( gene for gene in up_set if gene in gene_changes.index ) down_set = set( gene for gene in down_set if gene in gene_changes.index ) vals = [] for gene_set in [up_set, down_set]: vals.append( calculate_es_s( gene_changes, gene_set, n_h=len(up_set) + len(down_set), **kwargs ) ) up_hits = vals[0]['hits'] down_hits = vals[1]['hits'] hit_list = vals[0]['hit_list'] + vals[1]['hit_list'] upcumsum = vals[0]['cumscore'] downcumsum = vals[1]['cumscore'] ess = ( vals[0]['ess'] if vals[0]['hits'].any() else 0 ) - ( vals[1]['ess'] if vals[1]['hits'].any() else 0 ) return { 'hits': up_hits, 'down_hits': down_hits, 'cumscore': upcumsum, 'down_cumscore': downcumsum, 'ess': ess, 'hit_list': hit_list, }
def _get_set_cols(cols): for set_cols in [ ['up_set', 'down_set'], ['set'], ]: if any([i in cols for i in set_cols]): return set_cols
[docs]def enrichment_scores( psms, gene_sets, pval=True, recorrelate=False, metric=None, phenotype=None, **kwargs ): ''' Calculate enrichment scores for each gene set. p-values and q-values are calculated by scrambling the phenotypes assigned to each sample or scrambling peptides' fold changes, depending on the correlation metric used. Parameters ---------- psms : :class:`pandas.DataFrame` gene_sets : :class:`pandas.DataFrame` pval : bool, optional recorrelate : bool, optional metric : str, optional phenotype : :class:`pandas.Series`, optional kwargs : dict, optional See extra arguments passed to `calculate_es_s` and `simulate_es_s_pi`. Returns ------- df : :class:`pandas.DataFrame` ''' if metric is None: if phenotype is None: metric = 'fold' else: metric = 'spearman' assert metric in CORRELATION_METRICS if recorrelate: psms = correlate_phenotype(psms, phenotype=phenotype, metric=metric) gene_changes = get_gene_changes(psms) set_cols = _get_set_cols(gene_sets.columns) cols = [ 'name', 'cumscore', 'down_cumscore', 'ES(S)', 'hits', 'down_hits', 'hit_list', 'n_hits', ] + set_cols vals = pd.DataFrame( columns=cols, ) ess_args = { i: kwargs[i] for i in ['p', 'ess_method'] if i in kwargs } for set_id, row in gene_sets.iterrows(): if set(set_cols) == set(['set']): es_vals = calculate_es_s( gene_changes, row['set'], **ess_args ) else: es_vals = calculate_es_s_ud( gene_changes, row['up_set'], row['down_set'], **ess_args ) vals = vals.append( pd.Series( [ row['name'], es_vals.get('cumscore', []), es_vals.get('down_cumscore', []), es_vals.get('ess', 0), es_vals.get('hits', []), es_vals.get('down_hits', []), es_vals.get('hit_list', []), len(es_vals.get('hit_list', [])), ] + [row[i] for i in set_cols], name=set_id, index=cols, ) ) ess_pi_args = { i: kwargs[i] for i in ['p', 'p_iter', 'n_cpus', 'ess_method'] if i in kwargs } if pval: vals = simulate_es_s_pi( vals, psms, gene_sets, phenotype=phenotype, metric=metric, **ess_pi_args ) vals = estimate_pq(vals) vals = vals.sort_values('NES(S)', ascending=False) else: vals = vals.sort_values('ES(S)', ascending=False) return vals
[docs]def filter_gene_sets(gene_sets, psms, min_hits=10): ''' Filter gene sets to include only those with at least a given number of hits in a data set. Parameters ---------- gene_sets : :class:`pandas.DataFrame` psms : :class:`pandas.DataFrame` min_hits : int, optional Returns ------- df : :class:`pandas.DataFrame` ''' LOGGER.info('Filtering gene sets') total_sets = gene_sets all_genes = set(psms['ID']) set_cols = _get_set_cols(gene_sets.columns) combined_set = gene_sets.apply( lambda x: set( i for col in set_cols for i in x[col] ), axis=1, ) gene_sets = gene_sets[ combined_set.apply( lambda x: len(x) < len(all_genes) and len(all_genes.intersection(x)) >= min_hits ) ] LOGGER.info( 'Filtered {} gene sets down to {} with ≥ {} genes present' .format(total_sets.shape[0], gene_sets.shape[0], min_hits) ) return gene_sets
[docs]def filter_vals( vals, min_hits=0, min_abs_score=0, max_pval=1, max_qval=1, ): ''' Filter gene set enrichment scores using give ES(S) / p-value / q-value cutoffs. Parameters ---------- vals : :class:`pandas.DataFrame` min_hits : int, optional min_abs_score : float, optional max_pval : float, optional max_qval : float, optional Returns ------- df : :class:`pandas.DataFrame` ''' n_len = len(vals) if vals.shape[0] > 0: vals = vals[ vals.apply( lambda x: x['n_hits'] >= min_hits and abs(x['ES(S)']) >= min_abs_score and ( (x['p-value'] <= max_pval) if 'p-value' in x.index else True ) and ( (x['q-value'] <= max_qval) if 'q-value' in x.index else True ), axis=1, ) ] LOGGER.info( 'Filtered {} gene sets down to {} after cutoffs' .format(n_len, len(vals)) ) return vals