Source code for pyproteome.pathways

# -*- coding: UTF-8 -*-
"""
This module provides functionality for signal pathway analysis.

It includes functions for Gene Set Enrichment Analysis (GSEA) as well as
Phospho Set Enrichment Analysis (PSEA).
"""

from __future__ import absolute_import, division

from collections import OrderedDict
import logging
import os

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt


import pyproteome as pyp
import brainrnaseq as brs
from . import (
    binomial,
    enrichments,
    go,
    gskb,
    msigdb,
    pathwayscommon,
    photon_ptm,
    plot,
    plsr,
    psp,
    ptmsigdb,
    wikipathways,
)


LOGGER = logging.getLogger("pyproteome.pathways")


def _remap_data(psms, from_species, to_species):
    new = psms.copy()

    from_species = pyp.species.ORGANISM_MAPPING.get(from_species, from_species)
    to_species = pyp.species.ORGANISM_MAPPING.get(to_species, to_species)

    LOGGER.info(
        "Remapping phosphosites from {} to {}".format(from_species, to_species)
    )

    mapping = psp.get_phosphomap_data()
    mapping = mapping[["ACC_ID", "MOD_RSD", "ORGANISM", "SITE_GRP_ID"]]

    acc_mapping = mapping[
        mapping["ORGANISM"] == from_species
    ].set_index(
        ["ACC_ID", "MOD_RSD"]
    ).sort_index()
    site_mapping = mapping[
        mapping["ORGANISM"] == to_species
    ].set_index(
        ["SITE_GRP_ID"]
    ).sort_index()

    del mapping

    def _remap(val):
        acc, mod = val.split(",")

        try:
            site = acc_mapping.loc[acc, mod]
        except KeyError:
            return None

        site = site["SITE_GRP_ID"]
        if hasattr(site, 'iloc'):
            site = site.iloc[0]

        try:
            re_map = site_mapping.loc[site]
        except KeyError:
            return None

        if len(re_map.shape) > 1:
            re_map = re_map.iloc[0]

        acc = re_map["ACC_ID"]
        mod = re_map["MOD_RSD"]
        return ",".join([acc, mod])

    new["ID"] = new["ID"].apply(_remap)
    new = new[~(new["ID"].isnull())]

    LOGGER.info(
        "Mapping {} IDs ({}) down to {} IDs ({})"
        .format(psms.shape[0], from_species, new.shape[0], to_species)
    )

    return new


def _get_psite_ids(ds, species):
    LOGGER.info("Building list of individual phosphosites")
    new_rows = []

    def _split_rows(row):
        mods = row["Modifications"].get_mods("Phospho")

        # Generate rows for each phosphosite on a peptide mapped to each
        # possible protein for ambiguous peptides.
        for mod in mods:
            for gene, abs_pos in zip(row["Proteins"].accessions, mod.abs_pos):
                new_row = row.to_dict()

                mod_name = "{}{}-p".format(mod.letter, abs_pos + 1)

                new_row["ID"] = ",".join([gene, mod_name])
                new_rows.append(new_row)

    ds.psms.apply(_split_rows, axis=1)

    df = pd.DataFrame(new_rows, columns=list(ds.psms.columns) + ["ID"])

    return df


def _get_protein_ids(psms, species):
    if "Proteins" in psms.columns:
        return psms["Proteins"].apply(
            lambda row:
            brs.mapping.get_entrez_mapping(
                row.genes[0],
                species=species,
            ),
        )
    else:
        return psms["Gene"].apply(
            lambda row:
            brs.mapping.get_entrez_mapping(
                row,
                species=species,
            ),
        )


[docs]def get_pathways(species, p_sites=False, remap=False): """ Download all default gene sets and phospho sets. Parameters ---------- species : str Target species to use to generate gene / phospho sets. p_sites : bool, optional Build phospho sets if true else build gene sets. remap : bool, optional Remap proteins / phosphosites from all species to the target species. Returns ------- df : :class:`pandas.DataFrame` """ LOGGER.info( "Building gene sets (psites={}, remap={})" .format(p_sites, remap) ) if p_sites: pathways_df = psp.get_phosphosite(species, remap=remap) # pathways_df = pathways_df.append( # get_phosphosite_regulation(species, remap=remap) # ) else: pathways_df = wikipathways.get_wikipathways(species) if species in ["Homo sapiens", "human"] or remap: # pathways_df = pathways_df.append( # pathwayscommon.get_pathway_common(species) # ) pathways_df = pathways_df.append( msigdb.get_msigdb_pathways(species, remap=remap) ) # if species in ["Mus musculus"]: # pathways_df = pathways_df.append(gskb.get_gskb_pathways(species)) LOGGER.info("Loaded {} gene sets".format(pathways_df.shape[0])) return pathways_df.reset_index()
def _filter_ambiguous_peptides(ds): LOGGER.info( "Filtering ambiguous peptides ({} proteins)".format(len(set(ds.genes))) ) ds = ds.filter(ambiguous=False) LOGGER.info("Filtered down to {} proteins".format(len(set(ds.genes)))) return ds def _get_scores(psms, phenotype=None, metric=None): if metric is None: if phenotype is None: metric = "fold" else: metric = "spearman" LOGGER.info("Building correlations using metric '{}'".format(metric)) psms = psms[~psms["ID"].isnull()] agg = {} if "Fold Change" in psms.columns: agg["Fold Change"] = np.nanmedian agg["Fold Change"] = np.max if phenotype is not None: agg.update({ chan: np.nanmedian # chan: np.max for chan in phenotype.index }) psms = psms.groupby( by="ID", as_index=False, ).agg(agg) if phenotype is not None and metric in ["spearman", "pearson"]: phenotype = pd.to_numeric(phenotype) psms = psms[ (~psms[phenotype.index].isnull()).sum(axis=1) >= enrichments.MIN_PERIODS ] psms = enrichments.correlate_phenotype( psms, phenotype=phenotype, metric=metric, ) if "Correlation" in psms.columns: psms = psms[ ~psms["Correlation"].isnull() ] return psms def _get_set_name(cols): for name in [ ["hit_list"], ["set"], ["up_set", "down_set"], ]: if any([i in cols for i in name]): return name
[docs]def filter_fn(vals, ds=None, species=None): if isinstance(vals, pd.Series): col_names = _get_set_name(vals.index) else: col_names = _get_set_name(vals.columns) def _p_site_isin(row): return any([ (acc, mod.letter, abs_pos + 1) in v_set for acc in row["Proteins"].accessions for mod in row["Sequence"].modifications.mods for abs_pos in mod.abs_pos ]) if not species and ds: species = list(ds.species)[0] def _gene_isin(row): return any([ brs.mapping.get_entrez_mapping(gene, species=species) in v_set for gene in row["Proteins"].genes ]) if isinstance(vals, pd.Series): p_sites = any( "," in i for col in col_names for i in vals[col] ) else: p_sites = ( vals.shape[0] > 0 and any( "," in list(vals.iloc[0][col])[0] for col in col_names ) ) if p_sites: if isinstance(vals, pd.Series): v_set = set( ( x.split(",")[0], x.split(",")[1][0], int(x.split(",")[1][1:].split("-")[0]), ) for col in col_names for x in vals[col] ) else: v_set = set( tup for col in col_names for val_set in vals[col].apply( lambda s: set( ( x.split(",")[0], x.split(",")[1][0], int(x.split(",")[1][1:].split("-")[0]), ) for x in s ), ) for tup in val_set ) fn = _p_site_isin else: if isinstance(vals, pd.Series): v_set = set( i for col in col_names for i in vals[col] ) else: v_set = set( acc for col in col_names for set in vals[col] for acc in set ) fn = _gene_isin return fn
[docs]def gsea( psms=None, ds=None, gene_sets=None, metric=None, phenotype=None, species=None, min_hits=10, p_sites=False, remap=True, name=None, show_plots=True, **kwargs ): """ Perform Gene Set Enrichment Analysis (GSEA) on a data set. Parameters not listed below will be passed on to the underlying enrichments module. See :func:`pyproteome.pathways.enrichments.plot_gsea` for a full list of arguments. Parameters ---------- psms : :class:`pandas.DataFrame`, optional ds : :class:`pyproteome.data_sets.DataSet`, optional The data set to perform enrichment analysis on. phenotype : :class:`pandas.Series`, optional A series object with index values equal to the quantification columns in the data set. This object is used when calculating correlation statistics for each peptide. gene_sets : :class:`pandas.DataFrame`, optional A dataframe with two columns: "name" and "set". Each element of set should be a Python set() object containing all the gene IDs for each gene set. Gene IDs should be strings of Entrez Gene IDs for protein sets and strings of "<Entrez>,<letter><pos>-p" (i.e. "8778,Y544-p") for phospho sets. metric : str, optional Correlation metric to use. One of ["zscore", "fold", "spearman", "pearson", "kendall"]. phenotype : :class:`pandas.Series`, optional species : str, optional The species used to generate gene sets. Value should be in binomial nomenclature (i.e. "Homo sapiens", "Mus musculus"). If different from that of the input data set, IDs will be mapped to the target species using Phosphosite Plus's database. min_hits : int, optional p_sites : bool, optional Perform Phospho Set Enrichment Analysis (PSEA) on data set. remap : bool, optional Remap database of phosphosites using information from all species. name : str, optional The name of this analysis. Defaults to `ds.name`. show_plots : bool, optional See Also -------- :func:`pyproteome.pathways.enrichments.enrichment_scores` :func:`pyproteome.pathways.enrichments.plot_gsea` Returns ------- vals : :class:`pandas.DataFrame` gene_changes : :class:`pandas.DataFrame` """ assert psms is not None or ds is not None if ds is not None: ds = _filter_ambiguous_peptides(ds) # Don't scramble peptides without quantification values ds = ds.dropna(columns=['Fold Change']) if species is None: species = list(ds.species)[0] if name is None: name = "{}-{}".format( ds.name, "PSEA" if p_sites else "GSEA", ) if p_sites: psms = _get_psite_ids(ds, species) else: psms = ds.psms.copy() psms["ID"] = _get_protein_ids(psms, species) if species not in ds.species: psms = _remap_data( psms, from_species=list(ds.species)[0], to_species=species, ) else: psms["ID"] = _get_protein_ids(psms, species) if gene_sets is None: gene_sets = get_pathways(species, p_sites=p_sites, remap=remap) psms = _get_scores( psms, phenotype=phenotype, metric=metric, ) es_args = { "phenotype": phenotype, "metric": metric, } es_args.update({ i: kwargs.pop(i) for i in ["p", "pval", "p_iter", "n_cpus", 'ess_method'] if i in kwargs }) gene_changes = enrichments.get_gene_changes(psms) gene_sets = enrichments.filter_gene_sets( gene_sets, psms, min_hits=min_hits, ) vals = enrichments.enrichment_scores( psms, gene_sets, **es_args ) if show_plots: figures = plot.plot_gsea( vals, gene_changes, **kwargs ) else: figures = [] return vals, gene_changes, figures
[docs]def psea(*args, **kwargs): """ Perform Gene Set Enrichment Analysis (GSEA) on a data set. See :func:`pyproteome.pathways.gsea` for documentation and a full list of arguments. Returns ------- df : :class:`pandas.DataFrame`, optional """ kwargs["p_sites"] = True return gsea(*args, **kwargs)
[docs]def ssgsea( ds=None, thres_na=None, *args, **kwargs ): assert ds is not None cmp_groups = ds.cmp_groups or [list(ds.groups.keys())] if ds.cmp_groups is None: ds = ds.norm_cmp_groups(cmp_groups) samples = [ sample for cmp_group in cmp_groups for group in cmp_group for sample in ds.groups[group] if sample in ds.channels ] channels = [ ds.channels[sample] for sample in samples ] gsea_vals = OrderedDict() assert kwargs.get("metric", "fold") in ["log2", "fold", "zscore"] LOGGER.info( "Calculating ssGSEA scores for {} samples".format(len(samples)) ) for ind, (sample, chan) in enumerate(zip(samples, channels), start=1): LOGGER.info( "-- ssGSEA {}/{}: {} ({})".format(ind, len(samples), sample, chan) ) ds.psms["Fold Change"] = ds.psms[chan] vals, gene_changes, figures = gsea(ds=ds, show_plots=False, *args, **kwargs) gsea_vals[sample] = vals cols = [ "sample", "name", "ES(S)", "NES(S)", "n_hits", "p-value", "q-value", ] for name in gsea_vals: gsea_vals[name]["sample"] = name cols = [ i for i in cols if any([i in vals.columns for vals in gsea_vals.values()]) ] df_ssgsea = pd.concat([ vals[cols] for vals in gsea_vals.values() ]) df_ssgsea["sort_index"] = df_ssgsea["sample"].apply( lambda x: samples.index(x) ) df_ssgsea = df_ssgsea.sort_values("sort_index", ascending=True) plot_ssgsea_heatmap( df_ssgsea, ds=ds, max_qval=kwargs.get("max_qval", 1), ) return df_ssgsea
def plot_ssgsea_heatmap( df, max_qval=1, thres_na=None, ds=None, ax=None, ): if ds is not None: cmp_groups = [ds.group_b, ds.group_a] or [list(ds.groups.keys())] samples = [ sample for cmp_group in cmp_groups for group in cmp_group for sample in ds.groups[group] if sample in ds.channels ] else: samples = df["sample"].drop_duplicates() LOGGER.info("Plotting ssGSEA NES(S) heatmap") df["sig"] = df["q-value"].apply( lambda x: x < max_qval ) df["sig_symbol"] = df["sig"].apply( lambda x: "*" if x else "" ) if thres_na is None: thres_na = len(set(df["sample"])) / 3 thres_sig = 4 filtered_names = [ name for name in set(df["name"]) if (df["name"] == name).sum() >= thres_na and ((df["name"] == name) & df["sig"]).sum() >= thres_sig ] df = df[df["name"].isin(filtered_names)] if ax is None: f, ax = plt.subplots( figsize=(8, 6), ) sns.heatmap( df.pivot("name", "sample", "NES(S)")[samples], ax=ax, annot=df.pivot("name", "sample", "sig_symbol")[samples], fmt="s", # vmin=-5, # vmax=5, cmap=plt.cm.Spectral_r, ) ax.set_yticklabels( ax.get_yticklabels(), rotation=0, ) return ax.get_figure()
[docs]def sspsea(*args, **kwargs): kwargs["p_sites"] = True return ssgsea(*args, **kwargs)
__all__ = [ 'psea', 'gsea', 'ssgsea', 'sspsea', 'filter_fn', 'get_pathways', 'binomial', 'enrichments', 'go', 'gskb', 'msigdb', 'pathwayscommon', 'photon_ptm', 'plsr', 'psp', 'ptmsigdb', 'wikipathways', ]