# -*- coding: utf-8 -*-
'''
This module does most of the heavy lifting for the pathways module.
It includes functions for calculating enrichment scores and generating plots
for GSEA / PSEA.
'''
from __future__ import division
from collections import defaultdict
from functools import partial
import logging
import multiprocessing
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from . import plot
LOGGER = logging.getLogger('pyproteome.enrichments')
MIN_PERIODS = 5
'''
Minimum number of samples with peptide quantification and phenotypic
measurements needed to generate a correlation metric score.
'''
CORRELATION_METRICS = [
'spearman',
'pearson',
'kendall',
'fold',
'log2',
'zscore',
]
'''
Correlation metrics used for enrichment analysis. 'spearman', 'pearson', and
'kendall' are all calculated using `pandas.Series.corr()`.
'fold' takes ranking values direction from the 'Fold Change' column.
'log2' takes ranking values from a log2 'Fold Change' column.
'zscore' takes ranking values from a log2 z-scored 'Fold Change' column.
'''
DEFAULT_P = 0.75
DEFAULT_ESS_METHOD = 'integral'
DEFAULT_Q = 0.25
DEFAULT_RANK_CPUS = 6
'''
Default number of CPUs to use when scrambling rows of a data set.
'''
DEFAULT_CORR_CPUS = 4
'''
Default number of CPUs to use when scrambling columns of a data set.
'''
ESS_METHODS = {
'max_abs': lambda x: max(x, key=abs),
'max_min': lambda x: (max(x) - min(x)) * np.sign(max(x, key=abs)),
'integral': lambda x: np.trapz(x),
}
[docs]class PrPDF(object):
'''
An exact probability distribution estimator.
'''
def __init__(self, data):
self.data = np.sort(data)
[docs] def pdf(self, x):
'''
Probability density function.
Parameters
----------
x : float
Returns
-------
float
'''
return 1 / self.data.shape[0]
[docs] def cdf(self, x):
'''
Cumulative density function.
Parameters
----------
x : float
Returns
-------
float
'''
return np.searchsorted(self.data, x, side='left') / self.data.shape[0]
[docs] def sf(self, x):
'''
Survival function.
Parameters
----------
x : float
Returns
-------
float
'''
return 1 - (
np.searchsorted(self.data, x, side='right') / self.data.shape[0]
)
def _calc_p(ess, ess_pi):
return [
abs(ess) <= abs(i)
for i in ess_pi
if (i < 0) == (ess < 0)
]
def _shuffle(ser):
ind = ser.index
ser = shuffle(ser)
ser.index = ind
return ser
[docs]def simulate_es_s_pi(
vals,
psms,
gene_sets,
phenotype=None,
metric='spearman',
p_iter=1000,
n_cpus=None,
**kwargs
):
'''
Simulate ES(S, pi) by scrambling the phenotype / correlation values for a
data set and recalculating gene set enrichment scores.
Parameters
----------
vals : :class:`pandas.DataFrame`
psms : :class:`pandas.DataFrame`
gene_sets : :class:`pandas.DataFrame`
phenotype : :class:`pandas.Series`, optional
metric : str, optional
p_iter : int, optional
n_cpus : int, optional
Returns
-------
df : :class:`pandas.DataFrame`
'''
assert metric in CORRELATION_METRICS
LOGGER.info(
'Calculating ES(S, pi) for {} gene sets'.format(len(gene_sets))
)
if n_cpus is None:
n_cpus = DEFAULT_RANK_CPUS
if metric in ['spearman', 'pearson', 'kendall']:
n_cpus = DEFAULT_CORR_CPUS
if n_cpus > 1:
pool = multiprocessing.Pool(
processes=n_cpus,
)
gen = pool.imap_unordered(
partial(
_calc_essdist,
psms=psms.copy(),
gene_sets=gene_sets,
metric=metric,
**kwargs
),
[phenotype for _ in range(p_iter)],
)
else:
gen = (
_calc_essdist(
phen=phenotype,
psms=psms,
gene_sets=gene_sets,
metric=metric,
**kwargs
)
for _ in range(p_iter)
)
ess_dist = defaultdict(list)
for ind, ess in enumerate(
gen,
start=1,
):
for key, val in ess.items():
ess_dist[key].append(val)
if ind % (p_iter // min([p_iter, 10])) == 0:
LOGGER.info(
'-- Calculated {}/{} pvals'.format(ind, p_iter)
)
LOGGER.info('Calculating ES(S, pi) using {} cpus'.format(n_cpus))
vals['ES(S, pi)'] = vals.index.map(
lambda row: ess_dist[row],
)
return vals
def _calc_q(nes, nes_pdf, nes_pi_pdf):
if nes > 0:
return (
nes_pi_pdf.pdf(nes) + nes_pi_pdf.sf(nes)
) / (
nes_pdf.pdf(nes) + nes_pdf.sf(nes)
)
else:
return (
nes_pi_pdf.pdf(nes) + nes_pi_pdf.cdf(nes)
) / (
nes_pdf.pdf(nes) + nes_pdf.cdf(nes)
)
[docs]def estimate_pq(vals):
'''
Estimate p- and q-values for an enrichment analysis using the ES(S, pi)
values generated by `simulate_es_s_pi`.
Parameters
----------
vals : :class:`pandas.DataFrame`
'''
assert 'ES(S)' in vals.columns
assert 'ES(S, pi)' in vals.columns
vals = vals.copy()
mask = vals['ES(S)'] > 0
pos_ess = vals['ES(S)'][mask]
neg_ess = vals['ES(S)'][~mask]
pos_pi = vals['ES(S, pi)'].apply(np.array).apply(lambda x: x[x > 0])
neg_pi = vals['ES(S, pi)'].apply(np.array).apply(lambda x: x[x < 0])
pos_mean = pos_pi.apply(np.mean).abs()
neg_mean = neg_pi.apply(np.mean).abs()
pos_nes = pos_ess / pos_mean
neg_nes = neg_ess / neg_mean
# assert (pos_nes.isnull() | neg_nes.isnull()).all()
# assert ((~pos_nes.isnull()) | (~neg_nes.isnull())).all()
pos_pi_nes = pos_pi / pos_mean
neg_pi_nes = neg_pi / neg_mean
vals['NES(S)'] = pos_nes.fillna(neg_nes)
vals['pos NES(S, pi)'] = pos_pi_nes
vals['neg NES(S, pi)'] = neg_pi_nes
pos_mat = (
np.concatenate(pos_pi_nes.values)
if pos_pi_nes.shape[0] > 0 else
np.array([])
)
neg_mat = (
np.concatenate(neg_pi_nes.values)
if neg_pi_nes.shape[0] > 0 else
np.array([])
)
plot.plot_nes_dist(
vals['NES(S)'].values,
np.concatenate([pos_mat, neg_mat]),
)
pos_pdf = PrPDF(pos_nes.dropna().values)
neg_pdf = PrPDF(neg_nes.dropna().values)
pos_pi_pdf = PrPDF(pos_mat)
neg_pi_pdf = PrPDF(neg_mat)
LOGGER.info('Generated NES(S) distributions')
if vals.shape[0] > 0:
vals['p-value'] = vals.apply(
lambda x:
_frac_true(
_calc_p(x['ES(S)'], x['ES(S, pi)'])
),
axis=1,
)
vals['q-value'] = vals.apply(
lambda x:
_calc_q(
x['NES(S)'],
pos_pdf if x['NES(S)'] > 0 else neg_pdf,
pos_pi_pdf if x['NES(S)'] > 0 else neg_pi_pdf,
),
axis=1,
)
LOGGER.info('Calculated p, q values')
return vals
def _frac_true(x):
return sum(x) / max([len(x), 1])
[docs]def get_gene_changes(psms):
'''
Extract the gene IDs and correlation values for each gene / phosphosite in
a data set. Merge together duplicate IDs by calculating their mean
correlation.
Parameters
----------
psms : :class:`pandas.DataFrame`
'''
LOGGER.info('Getting gene correlations ({} IDs)'.format(psms.shape[0]))
gene_changes = psms[['ID', 'Correlation']].copy()
if gene_changes.shape[0] > 0:
gene_changes = gene_changes.groupby(
'ID',
as_index=False,
).agg({
'Correlation': np.mean,
})
gene_changes = gene_changes.sort_values(by='Correlation', ascending=False)
gene_changes = gene_changes.set_index(keys='ID')
return gene_changes
def _calc_essdist(
phen=None,
psms=None,
gene_sets=None,
metric='spearman',
**kwargs
):
assert metric in CORRELATION_METRICS
if phen is not None:
phen = _shuffle(phen)
if metric in ['fold', 'zscore']:
psms = psms.copy()
psms['Fold Change'] = _shuffle(psms['Fold Change'])
vals = enrichment_scores(
psms,
gene_sets,
phenotype=phen,
metric=metric,
recorrelate=True,
pval=False,
**kwargs
)
return vals['ES(S)']
[docs]def correlate_phenotype(psms, phenotype=None, metric='spearman'):
'''
Calculate the correlation values for each gene / phosphosite in a data set.
Parameters
----------
psms : :class:`pandas.DataFrame`
phenotype : :class:`pandas.Series`, optional
metric : str, optional
The correlation function to use. See CORRELATION_METRICS for a full
list of choices.
Returns
-------
psms : :class:`pandas.DataFrame`
'''
assert metric in CORRELATION_METRICS
psms = psms.copy()
if metric in ['spearman', 'pearson', 'kendall']:
LOGGER.info(
'Calculating correlations using metric \'{}\' (samples: {})'
.format(metric, list(phenotype.index))
)
psms['Correlation'] = psms.apply(
lambda row:
phenotype.corr(
pd.to_numeric(row[phenotype.index]),
method=metric,
min_periods=MIN_PERIODS,
),
axis=1,
)
else:
LOGGER.info(
'Calculating ranks'
)
new = psms['Fold Change']
if (
metric in ['log2'] or
(metric in ['zscore'] and (new > 0).all())
):
new = new.apply(np.log2)
if metric in ['zscore']:
new = (new - new.mean()) / new.std()
psms['Correlation'] = new
return psms
[docs]def calculate_es_s(gene_changes, gene_set, p=None, n_h=None, ess_method=None):
'''
Calculate the enrichment score for an individual gene set.
Parameters
----------
gene_changes : :class:`pandas.DataFrame`
gene_set : set of str
p : float, optional
n_h : int, optional
ess_method : str, optional
One of {'integral', 'max_abs', 'max_min'}.
Returns
-------
dict
'''
if p is None:
p = DEFAULT_P
if ess_method is None:
ess_method = DEFAULT_ESS_METHOD
assert ess_method in ESS_METHODS
n = len(gene_changes)
gene_set = set(
gene
for gene in gene_set
if gene in gene_changes.index
)
hits = gene_changes.index.isin(gene_set)
hit_list = gene_changes[hits].index.tolist()
if n_h is None:
n_h = len(gene_set)
n_r = gene_changes[hits]['Correlation'].apply(
lambda x: abs(x) ** p
).sum()
scores = [0] + [
((abs(val) ** p) / n_r)
if hit else
(-1 / (n - n_h))
for hit, val in zip(hits, gene_changes['Correlation'])
]
cumsum = np.cumsum(scores)
ess = ESS_METHODS.get(ess_method)(cumsum)
return {
'hits': hits,
'cumscore': cumsum,
'ess': ess,
'hit_list': hit_list,
}
[docs]def calculate_es_s_ud(gene_changes, up_set, down_set, **kwargs):
'''
Calculate the enrichment score for an individual gene set.
Parameters
----------
gene_changes : :class:`pandas.DataFrame`
gene_set : set of str
kwargs : dict, optional
See extra arguments passed to calculate_es_s.
Returns
-------
dict
'''
up_set = set(
gene
for gene in up_set
if gene in gene_changes.index
)
down_set = set(
gene
for gene in down_set
if gene in gene_changes.index
)
vals = []
for gene_set in [up_set, down_set]:
vals.append(
calculate_es_s(
gene_changes,
gene_set,
n_h=len(up_set) + len(down_set),
**kwargs
)
)
up_hits = vals[0]['hits']
down_hits = vals[1]['hits']
hit_list = vals[0]['hit_list'] + vals[1]['hit_list']
upcumsum = vals[0]['cumscore']
downcumsum = vals[1]['cumscore']
ess = (
vals[0]['ess'] if vals[0]['hits'].any() else 0
) - (
vals[1]['ess'] if vals[1]['hits'].any() else 0
)
return {
'hits': up_hits,
'down_hits': down_hits,
'cumscore': upcumsum,
'down_cumscore': downcumsum,
'ess': ess,
'hit_list': hit_list,
}
def _get_set_cols(cols):
for set_cols in [
['up_set', 'down_set'],
['set'],
]:
if any([i in cols for i in set_cols]):
return set_cols
[docs]def enrichment_scores(
psms,
gene_sets,
pval=True,
recorrelate=False,
metric=None,
phenotype=None,
**kwargs
):
'''
Calculate enrichment scores for each gene set.
p-values and q-values are calculated by scrambling the phenotypes assigned
to each sample or scrambling peptides' fold changes, depending on the
correlation metric used.
Parameters
----------
psms : :class:`pandas.DataFrame`
gene_sets : :class:`pandas.DataFrame`
pval : bool, optional
recorrelate : bool, optional
metric : str, optional
phenotype : :class:`pandas.Series`, optional
kwargs : dict, optional
See extra arguments passed to `calculate_es_s` and `simulate_es_s_pi`.
Returns
-------
df : :class:`pandas.DataFrame`
'''
if metric is None:
if phenotype is None:
metric = 'fold'
else:
metric = 'spearman'
assert metric in CORRELATION_METRICS
if recorrelate:
psms = correlate_phenotype(psms, phenotype=phenotype, metric=metric)
gene_changes = get_gene_changes(psms)
set_cols = _get_set_cols(gene_sets.columns)
cols = [
'name',
'cumscore',
'down_cumscore',
'ES(S)',
'hits',
'down_hits',
'hit_list',
'n_hits',
] + set_cols
vals = pd.DataFrame(
columns=cols,
)
ess_args = {
i: kwargs[i]
for i in ['p', 'ess_method']
if i in kwargs
}
for set_id, row in gene_sets.iterrows():
if set(set_cols) == set(['set']):
es_vals = calculate_es_s(
gene_changes,
row['set'],
**ess_args
)
else:
es_vals = calculate_es_s_ud(
gene_changes,
row['up_set'],
row['down_set'],
**ess_args
)
vals = vals.append(
pd.Series(
[
row['name'],
es_vals.get('cumscore', []),
es_vals.get('down_cumscore', []),
es_vals.get('ess', 0),
es_vals.get('hits', []),
es_vals.get('down_hits', []),
es_vals.get('hit_list', []),
len(es_vals.get('hit_list', [])),
] + [row[i] for i in set_cols],
name=set_id,
index=cols,
)
)
ess_pi_args = {
i: kwargs[i]
for i in ['p', 'p_iter', 'n_cpus', 'ess_method']
if i in kwargs
}
if pval:
vals = simulate_es_s_pi(
vals, psms, gene_sets,
phenotype=phenotype,
metric=metric,
**ess_pi_args
)
vals = estimate_pq(vals)
vals = vals.sort_values('NES(S)', ascending=False)
else:
vals = vals.sort_values('ES(S)', ascending=False)
return vals
[docs]def filter_gene_sets(gene_sets, psms, min_hits=10):
'''
Filter gene sets to include only those with at least a given number of
hits in a data set.
Parameters
----------
gene_sets : :class:`pandas.DataFrame`
psms : :class:`pandas.DataFrame`
min_hits : int, optional
Returns
-------
df : :class:`pandas.DataFrame`
'''
LOGGER.info('Filtering gene sets')
total_sets = gene_sets
all_genes = set(psms['ID'])
set_cols = _get_set_cols(gene_sets.columns)
combined_set = gene_sets.apply(
lambda x:
set(
i
for col in set_cols
for i in x[col]
),
axis=1,
)
gene_sets = gene_sets[
combined_set.apply(
lambda x:
len(x) < len(all_genes) and
len(all_genes.intersection(x)) >= min_hits
)
]
LOGGER.info(
'Filtered {} gene sets down to {} with ≥ {} genes present'
.format(total_sets.shape[0], gene_sets.shape[0], min_hits)
)
return gene_sets
[docs]def filter_vals(
vals,
min_hits=0,
min_abs_score=0,
max_pval=1,
max_qval=1,
):
'''
Filter gene set enrichment scores using give ES(S) / p-value / q-value
cutoffs.
Parameters
----------
vals : :class:`pandas.DataFrame`
min_hits : int, optional
min_abs_score : float, optional
max_pval : float, optional
max_qval : float, optional
Returns
-------
df : :class:`pandas.DataFrame`
'''
n_len = len(vals)
if vals.shape[0] > 0:
vals = vals[
vals.apply(
lambda x:
x['n_hits'] >= min_hits and
abs(x['ES(S)']) >= min_abs_score and (
(x['p-value'] <= max_pval)
if 'p-value' in x.index else
True
) and (
(x['q-value'] <= max_qval)
if 'q-value' in x.index else
True
),
axis=1,
)
]
LOGGER.info(
'Filtered {} gene sets down to {} after cutoffs'
.format(n_len, len(vals))
)
return vals