Source code for pyproteome.pathways.wikipathways


import io
import logging
import requests
import re
import zipfile
import xml.etree.ElementTree as ET

import pandas as pd

import pyproteome as pyp

# URL template for the per-species GMT file; ``{date}`` and ``{species}`` are
# filled in with ``str.format`` by get_wikipathways().
WIKIPATHWAYS_GMT_URL = '/'.join([
    'http://data.wikipathways.org',
    '{date}',
    'gmt',
    'wikipathways-{date}-gmt-{species}.gmt',
])
# URL template for the per-species zip archive of GPML files, filled in by
# get_wikipathways_psites(). NOTE: the constant is spelled "GMPL" while the
# URL path uses "gpml"; the name is kept because other code refers to it.
WIKIPATHWAYS_GMPL_URL = '/'.join([
    'http://data.wikipathways.org',
    '{date}',
    'gpml',
    'wikipathways-{date}-gpml-{species}.zip',
])
# Prefix -> namespace URI map handed to ElementTree ``findall`` so that
# ``wp:``-prefixed paths resolve.
WIKIPATHWAYS_NS = dict(wp='http://pathvisio.org/GPML/2013a')
# Matches comment text of the form
# ``parent=Q9NQB0; position=thr212; ptm=p; direction=d``.
# Groups: 1 = parent, 2 = position, 3 = ptm, 4 = direction.
RE_WIKIPATHWAYS = re.compile(
    '; '.join(
        '{}=([^;]+)'.format(field)
        for field in ('parent', 'position', 'ptm', 'direction')
    )
)

# Module-level logger shared by the download helpers in this file.
LOGGER = logging.getLogger('pyproteome.wikipathways')


@pyp.utils.memoize
def _get_wp_date():
    '''
    Return the WikiPathways release identifier used in download URLs.

    Returns
    -------
    date : str
        The hard-coded release string, currently ``'20180710'``.
    '''
    release_date = '20180710'
    return release_date


@pyp.utils.memoize
def get_wikipathways(species):
    '''
    Download gene sets from WikiPathways.

    Parameters
    ----------
    species : str
        Species identifier. It is looked up in
        ``pyp.species.INV_ORGANISM_MAPPING``; when it is not a key there,
        the value is used unchanged.

    Returns
    -------
    df : :class:`pandas.DataFrame`
        One row per line of the downloaded GMT file, with columns ``name``
        (pathway name) and ``set`` (set of the tab-separated gene entries).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    AssertionError
        If a line is annotated with a species other than the requested one.
    '''
    LOGGER.info('Fetching WikiPathways')

    species = pyp.species.INV_ORGANISM_MAPPING.get(species, species)

    url = WIKIPATHWAYS_GMT_URL.format(
        date=_get_wp_date(),
        species=species.replace(' ', '_'),
    )

    def _get_data(line):
        # Each line is: <header> TAB <ignored column> TAB <gene> TAB <gene>...
        name, _, genes = line.decode('utf-8').split('\t', 2)
        # The header holds four '%'-separated fields; the first is the
        # pathway name and the last is the species.
        name, _, _, spec = name.split('%')
        assert species == spec, (
            'Expected species {!r}, found {!r}'.format(species, spec)
        )
        return name, set(genes.split('\t'))

    response = requests.get(url, stream=True)

    try:
        response.raise_for_status()
        rows = [
            _get_data(line)
            for line in response.iter_lines()
            # Skip blank / keep-alive lines, which cannot be unpacked.
            if line
        ]
    finally:
        # With stream=True the connection is only released once the body is
        # fully consumed or the response is closed; close it explicitly so
        # that a parsing error does not leak the connection.
        response.close()

    return pd.DataFrame(
        data=rows,
        columns=['name', 'set'],
    )


@pyp.utils.memoize
def get_wikipathways_psites(species):
    '''
    Download phospho sets from WikiPathways.

    Parameters
    ----------
    species : str
        Species identifier. It is looked up in
        ``pyp.species.INV_ORGANISM_MAPPING``; when it is not a key there,
        the value is used unchanged.

    Returns
    -------
    df : :class:`pandas.DataFrame`
        One row per archive member that has more than 5 annotated sites in
        total, with columns ``name``, ``WikiPathways ID``, ``upregulated``
        and ``downregulated``. The last two columns hold sets of strings
        formatted as ``'<parent>,<site>-p'``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    '''
    LOGGER.info('Fetching WikiPathways')

    species = pyp.species.INV_ORGANISM_MAPPING.get(species, species)

    url = WIKIPATHWAYS_GMPL_URL.format(
        date=_get_wp_date(),
        species=species.replace(' ', '_'),
    )
    response = requests.get(url, stream=False)
    response.raise_for_status()

    def _process_site(sname):
        # Collapse three-letter residue names into one-letter codes,
        # e.g. 'thr212' -> 'T212'.
        for long_name, letter in (('ser', 'S'), ('thr', 'T'), ('tyr', 'Y')):
            sname = sname.replace(long_name, letter)
        return sname

    def _site_set(matches, direction):
        # Format every site annotated with the given direction flag.
        return set(
            '{},{}-p'.format(parent, site)
            for parent, site, site_direction in matches
            if site_direction == direction
        )

    def _get_data(archive, name):
        # Close the member handle as soon as its contents have been parsed.
        with archive.open(name) as f:
            root = ET.fromstring(f.read())

        sites = root.findall(
            'wp:State/wp:Comment',
            WIKIPATHWAYS_NS,
        )
        # Comment text looks like:
        # parent=Q9NQB0; position=thr212; ptm=p; direction=d
        matches = [
            RE_WIKIPATHWAYS.match(i.text)
            for i in sites
            if i.text
        ]
        # Keep only annotations whose ptm field is 'p'.
        matches = [
            (
                m.group(1),
                _process_site(m.group(2)),
                m.group(4),
            )
            for m in matches
            if m and m.group(3) == 'p'
        ]

        # Drop the extension and the first underscore-separated token of the
        # member name, then split the remainder into title and ID.
        tmp = name.replace('_', ' ').rsplit('.', 1)[0].split(' ', 1)[1]
        # The ID follows the title, so split on the last ' WP'; this keeps
        # titles that themselves contain ' WP' from breaking the unpacking.
        title, wp_id = tmp.rsplit(' WP', 1)
        wp_id = 'WP' + wp_id

        return (
            title,
            wp_id,
            _site_set(matches, 'u'),
            _site_set(matches, 'd'),
        )

    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        LOGGER.info('Parsing WikiPathways phosphosites')

        data = [
            _get_data(z, name)
            for name in z.namelist()
        ]

    # Drop entries without any site. Indices 2 and 3 are the upregulated and
    # downregulated sets (index 1 is the ID string, which is never empty).
    data = [
        row
        for row in data
        if row[2] or row[3]
    ]

    pathways_df = pd.DataFrame(
        data=data,
        columns=['name', 'WikiPathways ID', 'upregulated', 'downregulated'],
    )
    # Only keep pathways with more than 5 annotated sites overall.
    pathways_df = pathways_df[
        pathways_df['upregulated'].apply(len) +
        pathways_df['downregulated'].apply(len) > 5
    ]

    return pathways_df