# Source code for pyproteome.pathways.pathwayscommon


import gzip
import io
import logging
import re
import requests

import pandas as pd

import pyproteome as pyp
import brainrnaseq as brs

# Module-level logger; the name is fixed rather than derived from __name__.
LOGGER = logging.getLogger('pyproteome.pathwayscommon')

# Gzipped GMT archive fetched by get_pathway_common(): archive directory
# followed by the file name.
PATHWAYS_COMMON_URL = (
    'http://www.pathwaycommons.org/archives/PC2/v9/' +
    'PathwayCommons9.All.hgnc.gmt.gz'
)


@pyp.utils.memoize
def get_pathway_common(species):
    '''
    Download gene sets from Pathway Commons.

    Fetches the gzipped GMT archive at ``PATHWAYS_COMMON_URL`` and parses
    each non-blank line into a pathway name and a set of gene identifiers.
    Every gene symbol on a line is passed through
    ``brs.mapping.get_entrez_mapping`` before being stored.

    Parameters
    ----------
    species : str
        Forwarded as the ``species`` keyword of
        ``brs.mapping.get_entrez_mapping`` for every gene symbol.

    Returns
    -------
    df : :class:`pandas.DataFrame`
        One row per gene set, with the columns ``name`` (pathway name taken
        from the line's description field) and ``set`` (set of mapped gene
        identifiers).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If a line has fewer than three tab-separated fields, its description
        field does not have the expected
        ``name: ...; datasource: ...; organism: ...; idtype: ...`` layout, or
        its organism is not 9606.
    '''
    LOGGER.info('Fetching Pathways Common')

    # stream=True defers the body download until ``r.content`` is read, so
    # an HTTP error is reported before the archive is transferred.
    r = requests.get(PATHWAYS_COMMON_URL, stream=True)
    r.raise_for_status()

    header_re = re.compile(
        r'name: (.+); datasource: (.+); organism: (.+); idtype: (.+)'
    )

    def _get_data(line):
        # Parse one raw GMT line (bytes) into a (name, gene set) pair.
        # Drop the line terminator first; otherwise it stays glued to the
        # last gene symbol and corrupts its lookup.
        line = line.decode('utf-8').rstrip('\r\n')

        # First field is discarded, second is the description, the rest are
        # the tab-separated gene symbols.
        _, header, genes = line.split('\t', 2)

        match = header_re.match(header)

        if match is None:
            raise ValueError(
                'Unrecognized gene set description: {!r}'.format(header)
            )

        name, _datasource, organism, _id_type = match.groups()

        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if int(organism) != 9606:
            raise ValueError(
                'Unexpected organism {!r} for gene set {!r}'.format(
                    organism, name,
                )
            )

        mapped = {
            brs.mapping.get_entrez_mapping(symbol, species=species)
            for symbol in genes.split('\t')
            # A trailing tab would otherwise produce an empty symbol.
            if symbol
        }

        return name, mapped

    # Close the decompressor deterministically once all lines are parsed.
    with gzip.GzipFile(fileobj=io.BytesIO(r.content)) as archive:
        rows = [
            _get_data(line)
            for line in archive
            # Skip blank lines (e.g. a trailing newline at end of file).
            if line.strip()
        ]

    return pd.DataFrame(data=rows, columns=['name', 'set'])