Source code for pyproteome.pathways.msigdb


import logging
import requests

import pandas as pd

import pyproteome as pyp


MSIGDB_URL = (
    'https://raw.githubusercontent.com/white-lab/pyproteome-data'
    '/master/msigdb/'
)
MSIGDB_VERSION = 'v7.1'
MSIGDB_FILES = tuple([
    i.format(MSIGDB_VERSION)
    for i in [
        'h.all.{}.entrez.gmt',
        # 'c1.all.{}.entrez.gmt',
        # 'c2.all.{}.entrez.gmt',
        # 'c2.cgp.{}.entrez.gmt',
        # 'c2.cp.biocarta.{}.entrez.gmt',
        # 'c2.cp.kegg.{}.entrez.gmt',
        # 'c2.cp.reactome.{}.entrez.gmt',
        # 'c2.cp.{}.entrez.gmt',
        # 'c3.all.{}.entrez.gmt',
        # 'c3.mir.{}.entrez.gmt',
        # 'c3.tft.{}.entrez.gmt',
        # 'c4.all.{}.entrez.gmt',
        # 'c4.cgn.{}.entrez.gmt',
        # 'c4.cm.{}.entrez.gmt',
        # 'c5.all.{}.entrez.gmt',
        # 'c5.bp.{}.entrez.gmt',
        # 'c5.cc.{}.entrez.gmt',
        # 'c5.mf.{}.entrez.gmt',
        # 'c6.all.{}.entrez.gmt',
        # 'c7.all.{}.entrez.gmt',
    ]
])

LOGGER = logging.getLogger('pyproteome.msigdb')

try:
    from genemap.mappers import EnsemblMapper
except ImportError:
    pass


[docs]@pyp.utils.memoize def get_msigdb_pathways(species, remap=None): ''' Download gene sets from MSigDB. Currently downloads v7.0 of the gene signature repositories. Parameters ---------- species : str Returns ------- df : :class:`pandas.DataFrame`, optional ''' LOGGER.info('Fetching MSigDB pathways') def _get_requests(): for file in MSIGDB_FILES: url = MSIGDB_URL + file LOGGER.info('Fetching {}'.format(url)) response = requests.get(url, stream=True) response.raise_for_status() yield response def _get_data(line): line = line.decode('utf-8') name, _, genes = line.split('\t', 2) # name, _, _, spec = name.split('%') # assert species == spec return name, set(i for i in genes.split('\t')) pathways_df = pd.DataFrame( data=[ _get_data(line) for response in _get_requests() for line in response.iter_lines() ], columns=['name', 'set'], ) if remap and species not in ['Homo sapiens']: to_name = '{}{}'.format( species.split(' ')[0][0], species.split(' ')[1], ).lower() LOGGER.info('Remapping MSigDB to {} ({})'.format(species, to_name)) mapper = EnsemblMapper( from_type='entrez', to_type='entrez', from_organism='hsapiens', to_organism=to_name, ) pathways_df['set'] = pathways_df['set'].apply( lambda row: set(mapper.map_ids(row)) ) return pathways_df