Source code for pyproteome.pathways.psp


import gzip
import io
import logging
import requests

import pandas as pd

import pyproteome as pyp

# Module-level logger. Note that it is registered under the name
# 'pyproteome.phosphosite', not under this module's own dotted path.
LOGGER = logging.getLogger('pyproteome.phosphosite')

# Download URL of the gzip-compressed regulatory sites table
# (fetched by get_phosphoreg_data).
PSP_REGULATORY_URL = (
    'https://www.phosphosite.org/downloads/Regulatory_sites.gz'
)
# Download URL of the gzip-compressed phosphorylation site table
# (fetched by get_phosphomap_data).
PSP_SITE_MAPPING_URL = (
    'https://www.phosphosite.org/downloads/Phosphorylation_site_dataset.gz'
)


@pyp.utils.memoize
def get_phosphomap_data():
    '''
    Fetch the Phosphosite Plus phosphorylation site table.

    The gzip archive at ``PSP_SITE_MAPPING_URL`` is downloaded, held in
    memory, decompressed and parsed as a tab-separated table. The first
    three lines of the file are skipped before the header row is read.
    :func:`_remap_psp` uses this table to translate sites between species.

    Returns
    -------
    df : :class:`pandas.DataFrame`

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    '''
    LOGGER.info('Fetching Phosphosite Plus mapping data')

    response = requests.get(PSP_SITE_MAPPING_URL, stream=True)
    response.raise_for_status()

    # The whole payload is buffered in memory and wrapped in a file-like
    # object so that gzip can decompress it on the fly for pandas.
    compressed = io.BytesIO(response.content)
    table = gzip.GzipFile(fileobj=compressed)

    return pd.read_csv(table, skiprows=[0, 1, 2], sep='\t')
@pyp.utils.memoize
def get_phosphoreg_data():
    '''
    Fetch the Phosphosite Plus regulatory sites table.

    The gzip archive at ``PSP_REGULATORY_URL`` is downloaded, held in
    memory, decompressed and parsed as a tab-separated table. The first
    three lines of the file are skipped and only the first 21 columns are
    kept.

    Returns
    -------
    df : :class:`pandas.DataFrame`

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    '''
    LOGGER.info('Fetching Phosphosite Plus regulation data')

    response = requests.get(PSP_REGULATORY_URL, stream=True)
    response.raise_for_status()

    # The whole payload is buffered in memory and wrapped in a file-like
    # object so that gzip can decompress it on the fly for pandas.
    compressed = io.BytesIO(response.content)
    table = gzip.GzipFile(fileobj=compressed)

    return pd.read_table(
        table,
        skiprows=[0, 1, 2],
        sep='\t',
        usecols=range(21),
    )
@pyp.utils.memoize
def get_phosphosite(species, remap=False):
    '''
    Build kinase-substrate phospho sets from PhosphoSite Plus.

    One set of substrate sites is produced per kinase found in the table
    returned by ``pyp.motifs.phosphosite.get_data()``, restricted to
    substrates whose ``SUB_ORGANISM`` equals `species`.

    Parameters
    ----------
    species : str
        Organism name. It is first looked up in
        ``pyp.species.ORGANISM_MAPPING``; names without an entry are used
        unchanged.
    remap : bool, optional
        If True, the table is passed through :func:`_remap_psp` before
        filtering, so that sites annotated in other organisms can be
        translated onto `species` where a mapping exists.

    Returns
    -------
    df : :class:`pandas.DataFrame`
        One row per kinase, sorted by kinase name, with columns:

        - ``name``: the kinase.
        - ``up_set``: set of ``'<accession>,<site>'`` strings, where
          ``<accession>`` is ``SUB_ACC_ID`` truncated at its first ``'-'``
          (empty if the accession is not a string) and ``<site>`` is
          ``SUB_MOD_RSD`` carrying a ``'-p'`` suffix.
        - ``down_set``: always an empty set.

        Rows whose kinase is missing (NaN) are not included.
    '''
    LOGGER.info('Getting phosphosite data for {}'.format(species))

    species = pyp.species.ORGANISM_MAPPING.get(species, species)
    psp_data = pyp.motifs.phosphosite.get_data()

    if remap:
        psp_data = _remap_psp(
            psp_data, species,
            acc_col='SUB_ACC_ID',
            mod_col='SUB_MOD_RSD',
            org_col='SUB_ORGANISM',
        )

    psp_data = psp_data[psp_data['SUB_ORGANISM'] == species]

    # _remap_psp (with its default append_mod) already returns sites with
    # '-p' appended, so the suffix is only added here for un-remapped data.
    suffix = '' if remap else '-p'

    # A single groupby pass replaces re-filtering the whole table once per
    # kinase. It also yields kinases in sorted order (the previous set
    # iteration order was arbitrary) and skips NaN kinases, which used to
    # produce an up_set made of column names.
    rows = []
    for kinase, group in psp_data.groupby('KINASE'):
        up_set = {
            ','.join([
                acc.split('-')[0] if isinstance(acc, str) else '',
                mod + suffix,
            ])
            for acc, mod in zip(group['SUB_ACC_ID'], group['SUB_MOD_RSD'])
        }
        rows.append((kinase, up_set, set()))

    return pd.DataFrame(rows, columns=['name', 'up_set', 'down_set'])
@pyp.utils.memoize
def get_phosphosite_regulation(species, remap=False):
    '''
    Build regulation (``ON_PROCESS``) phospho sets from PhosphoSite Plus.

    The regulatory sites table from :func:`get_phosphoreg_data` is
    restricted to rows whose ``ORGANISM`` equals `species`, and one set of
    sites is produced for every process named in the ``;``-separated
    ``ON_PROCESS`` column.

    Parameters
    ----------
    species : str
        Organism name. It is first looked up in
        ``pyp.species.ORGANISM_MAPPING`` (names without an entry are used
        unchanged) and then lower-cased before comparison.
    remap : bool, optional
        If True, the table is passed through :func:`_remap_psp` before
        filtering, so that sites annotated in other organisms can be
        translated onto `species` where a mapping exists.

    Returns
    -------
    df : :class:`pandas.DataFrame`
        One row per process, sorted by process name, with columns:

        - ``name``: the process, stripped of surrounding whitespace.
        - ``up_set``: set of ``'<accession>,<site>'`` strings, where
          ``<accession>`` is ``ACC_ID`` truncated at its first ``'-'``
          (empty if the accession is not a string) and ``<site>`` is
          ``MOD_RSD`` as found in the table.
        - ``down_set``: always an empty set.
    '''
    LOGGER.info('Getting phosphosite regulation data for {}'.format(species))

    species = pyp.species.ORGANISM_MAPPING.get(species, species).lower()
    psp_data = get_phosphoreg_data()

    if remap:
        # append_mod='' leaves MOD_RSD untouched during the lookup;
        # presumably this table already stores sites with their
        # modification suffix -- confirm against the downloaded data.
        psp_data = _remap_psp(
            psp_data, species,
            set_col='ON_PROCESS',
            acc_col='ACC_ID',
            mod_col='MOD_RSD',
            org_col='ORGANISM',
            append_mod='',
        )

    psp_data = psp_data[psp_data['ORGANISM'] == species]

    # Single pass over the table: each row's site is added to the set of
    # every process it lists. Processes are matched by exact (stripped)
    # name; the earlier substring test put a row into the set of any
    # process whose name merely occurred inside one of the row's entries.
    site_sets = {}
    columns = zip(
        psp_data['ON_PROCESS'], psp_data['ACC_ID'], psp_data['MOD_RSD'],
    )

    for processes, acc, mod in columns:
        if pd.isna(processes):
            continue

        names = [name.strip() for name in processes.split(';')]
        names = [name for name in names if name]

        if not names:
            continue

        # Same accession handling as get_phosphosite: a non-string
        # accession contributes an empty accession field.
        site = ','.join([
            acc.split('-')[0] if isinstance(acc, str) else '',
            mod,
        ])

        for name in names:
            site_sets.setdefault(name, set()).add(site)

    # Sorted so the row order is reproducible between runs.
    return pd.DataFrame(
        [(name, site_sets[name], set()) for name in sorted(site_sets)],
        columns=['name', 'up_set', 'down_set']
    )
def _remap_psp(
    psp, species,
    set_col='KINASE',
    acc_col='ACC_ID',
    mod_col='MOD_RSD',
    org_col='ORGANISM',
    append_mod='-p',
    mapping=None,
):
    '''
    Translate phosphorylation sites in `psp` onto `species`.

    Every row whose organism differs from `species` is looked up in the
    site mapping table by (accession, site, organism). If that entry has a
    ``SITE_GRP_ID`` that also occurs for `species`, the row's accession,
    site and organism are replaced by the first matching `species` entry.
    Rows without a match keep their original accession and organism.

    Parameters
    ----------
    psp : :class:`pandas.DataFrame`
        Table containing at least the four columns named below.
    species : str
        Target organism, compared against the ``ORGANISM`` column of
        `mapping` and against `org_col` of `psp`.
    set_col : str, optional
        Column that is carried over unchanged (e.g. the kinase).
    acc_col : str, optional
        Column holding the protein accession.
    mod_col : str, optional
        Column holding the modified residue.
    org_col : str, optional
        Column holding the organism.
    append_mod : str, optional
        Text appended to every value of `mod_col` before the lookup. The
        appended form is also what ends up in the output.
    mapping : :class:`pandas.DataFrame`, optional
        Table with ``ACC_ID``, ``MOD_RSD``, ``ORGANISM`` and
        ``SITE_GRP_ID`` columns. Defaults to :func:`get_phosphomap_data`.

    Returns
    -------
    df : :class:`pandas.DataFrame`
        Result of applying the remapping to each row; each row carries
        only the columns `org_col`, `set_col`, `acc_col` and `mod_col`.
    '''
    LOGGER.info('Remapping sites to species: {}'.format(species))

    if mapping is None:
        mapping = get_phosphomap_data()

    subset = mapping[['ACC_ID', 'MOD_RSD', 'ORGANISM', 'SITE_GRP_ID']]
    is_target = subset['ORGANISM'] == species

    # Index for sites of the target species, keyed by site group.
    site_mapping = subset[is_target].set_index('SITE_GRP_ID').sort_index()

    # Index for sites of all other species, keyed by their coordinates.
    mod_mapping = subset[subset['ORGANISM'] != species].set_index(
        ['ACC_ID', 'MOD_RSD', 'ORGANISM']
    ).sort_index()

    # Drop the intermediate references; only the two indexes are needed.
    del subset, is_target

    out_columns = [org_col, set_col, acc_col, mod_col]

    def _find_equivalent(acc, mod, organism):
        # Return the (ACC_ID, MOD_RSD) entry of the target species that
        # shares a site group with the given site, or None if either
        # lookup has no match.
        try:
            hit = mod_mapping.loc(axis=0)[acc, mod, organism]
        except KeyError:
            return None

        group_id = hit['SITE_GRP_ID']

        # Several mapping rows may share the key; use the first one.
        if hasattr(group_id, 'iloc'):
            group_id = group_id.iloc[0]

        try:
            target = site_mapping.loc[group_id]
        except KeyError:
            return None

        # A site group may hold several target sites; use the first one.
        if len(target.shape) > 1:
            target = target.iloc[0]

        return target[['ACC_ID', 'MOD_RSD']]

    def _remap(row):
        set_name, acc, mod, organism = row[
            [set_col, acc_col, mod_col, org_col]
        ]
        mod = mod + append_mod

        if organism != species:
            # Remap the phosphorylation site if possible
            equivalent = _find_equivalent(acc, mod, organism)

            if equivalent is not None:
                acc, mod = equivalent
                organism = species

        return pd.Series(
            [organism, set_name, acc, mod],
            index=out_columns,
        )

    return psp.apply(_remap, axis=1)