Source code for pyproteome.pathways.gskb
import logging
import requests
import pandas as pd
import pyproteome as pyp
LOGGER = logging.getLogger('pyproteome.gskb')
GSKB_URL = (
'http://ge-lab.org/gskb/2-MousePath/mGSKB_Entrez.gmt'
)
[docs]@pyp.utils.memoize
def get_gskb_pathways(species):
'''
Download gene sets from GSKB.
Parameters
----------
species : str
Returns
-------
df : :class:`pandas.DataFrame`, optional
'''
LOGGER.info('Fetching GSKB pathways')
url = GSKB_URL
r = requests.get(url, stream=True)
r.raise_for_status()
def _get_data(line):
line = line.decode('windows-1252')
name, _, genes = line.split('\t', 2)
genes = set(i for i in genes.split('\t') if i)
return name, genes
pathways_df = pd.DataFrame(
data=[
_get_data(line)
for ind, line in enumerate(r.iter_lines())
if ind > 0
],
columns=['name', 'set'],
)
return pathways_df