Source code for pyproteome.pathways.photon_ptm


from collections import OrderedDict
import io
import logging
import requests
import os
import tarfile
import uuid

import pyproteome as pyp
import brainrnaseq as brs
import pandas as pd
import numpy as np

LOGGER = logging.getLogger('pathways.photon_ptm')

try:
    from genemap.mappers import EnsemblMapper
except ImportError:
    pass


parameters = {
    'activity': {
        'min_size': 4,
        'permutations': 1000,
        'side': 'greater',
    },
    'anat': {
        'alpha': 0.25,
        'anchor': -1,
    },
    'go': {
        'max_category_size': 500,
        'min_category_size': 5,
        'max_category_depth': 10,
    },
    'ppi-network': {
        'confidence': 0.5,
        'degree_threshold': 150,
    }
}


@pyp.utils.memoize
def _get_anat(dir, force=False):
    if os.path.exists(os.path.join(dir, 'db')) and not force:
        return

    url = 'http://cs.tau.ac.il/~jandanielr/db.tar.gz'

    LOGGER.info('Fetching PHOTON database from {} to {}'.format(url, dir))

    r = requests.get(url, stream=True)
    r.raw.decode_content = True
    r.raise_for_status()

    tar_file = tarfile.open(
        fileobj=io.BytesIO(r.content),
        mode='r',
    )
    tar_file.extractall(dir)

    return tar_file


@pyp.utils.memoize
def _map_gene(mapper, symbol_mapping, gene, species):
    symbol = gene
    entrez = brs.mapping.get_entrez_mapping(symbol, species=species)

    if species.lower().replace(' ', '_') not in ['homo_sapiens', 'human']:
        entrez = [i for i in mapper.map_ids([entrez]) if i]

        if not entrez:
            return None, None

        entrez = int(entrez[0])
        symbol = symbol_mapping.loc[entrez]

    return entrez, symbol


def _get_templates(template_dir):
    pyp.utils.makedirs(template_dir)

    url = (
        'https://raw.githubusercontent.com/jdrudolph/photon/'
        'c73d1eb7f5e7cab86031e056350c7b09fb5e1d51/templates/result.html'
    )
    r = requests.get(url)
    r.raise_for_status()

    with open(os.path.join(template_dir, 'result.html'), 'wb') as f:
        f.write(r.content)


[docs]def photon(ds, folder_name=None, write_output=False, log2=True): ''' Run PHOTON algorithm on a data set to find functional phosphorylation sites using protein-protein interaction networks. Parameters ---------- ds : :class:`pyproteome.data_sets.DataSet` folder_name : str, optional log2 : bool, optional Returns ------- out_path : str Path to results directory. ''' import phos import phos.defaults import phos.pipeline ds = ds.filter(fn=lambda x: len(x['Proteins']) < 2) ds = ds.filter(fn=lambda x: len(x['Scan']) >= 2) ds = ds.filter(mod='Phospho') species = list(ds.species)[0] from_name = '{}{}'.format( species.split(' ')[0][0], species.split(' ')[1], ).lower() species = species.replace(' ', '_') mapper = EnsemblMapper( from_type='entrez', to_type='entrez', from_organism=from_name, to_organism='hsapiens', ) symbol_mapping = brs.cache.get_mapping_data(species='Homo sapiens') symbol_mapping['Symbol'] = symbol_mapping.index symbol_mapping = symbol_mapping.set_index('GeneID')['Symbol'] def _get_phos_data(psms): hit = 0 miss = 0 for _, row in psms.iterrows(): gene = row['Proteins'].genes[0] entrez, symbol = _map_gene(mapper, symbol_mapping, gene, species) if not entrez: # print(gene, entrez, symbol) miss += 1 continue hit += 1 for mod in row['Modifications'].get_mods('Phospho'): yield pd.Series(OrderedDict([ ('GeneID', entrez), ('Amino.Acid', mod.letter), ('Position', 1 + mod.abs_pos[0]), ('avg', np.log2(row['Fold Change']) if log2 else row['Fold Change']), ('Symbol', symbol), ])) print(hit, miss) LOGGER.info('Generated data frame: {}, {}'.format(ds.name, ds.shape)) df = pd.DataFrame( list(_get_phos_data(ds.psms)) ).dropna() LOGGER.info('Generated data frame: {}'.format(df.shape)) df = df.sort_values('avg', ascending=False) name = str(uuid.uuid4()) _parameters = parameters.copy() _parameters['anat']['anchor'] = 1950 # XXX: Better directory for files that Photon must download for its use? dir = os.path.join(pyp.paths.FIGURES_DIR) defaults = phos.defaults.make_defaults(dir) _get_anat(dir) template_dir = os.path.join(defaults['root'], 'templates') _get_templates(template_dir) folder_name = pyp.utils.make_folder( data=ds, folder_name=folder_name, sub=os.path.join('Photon', name), ) csv_path = os.path.join(folder_name, 'results.csv') with open(csv_path, 'w') as csv_file: df.to_csv(csv_file, index=False) if write_output: phos.pipeline.run( name, csv_path, _parameters, template_dir, folder_name, defaults['db'], ) LOGGER.info('Wrote results to: {}'.format(folder_name)) return folder_name else: exp, scores, subnet, go_scores, predictions = phos.pipeline._run( name, csv_path, _parameters, # template_dir, # folder_name, defaults['db'], ) return exp, scores, subnet, go_scores, predictions