# Source code for pyproteome.pypuniprot

'''
This module provides functionality for fetching protein data from UniProt.

Caches fetched protein data for faster re-use.
'''

import os
import re
import shutil
import sqlite3
import tempfile

import pandas as pd
import uniprot

import pyproteome as pyp


# Matches a square-bracketed token made of alphanumerics, an underscore and
# uppercase letters (for example ``[EGFR_HUMAN]``). Group 1 is the token
# without the surrounding brackets.
RE_ACCESSION = re.compile(r'\[([A-Za-z0-9]+_[A-Z]+)\]')

# Matches a whole protein description string: a run of optional,
# pipe-delimited database prefixes (in the fixed order listed below), then an
# optional space, an identifier token, a space, and at least one more
# character. It is applied to ``ProteinAnnotations.Description`` values read
# from MSF files in prefetch_all_msf_uniprot.
# NOTE(review): the name suggests Proteome Discoverer output -- confirm.
#
# Capture groups, numbered by opening parenthesis:
#    1 -> whole "gi|<id>|" prefix      2 -> <id> inside it
#    3 -> whole "uc|...|" prefix       4-6 -> its optional sub-fields
#    7 -> whole "ref|<id>|" prefix     8 -> <id> inside it
#    9 -> whole "gb|<id>|" prefix     10 -> <id> inside it
#   11 -> whole "gnl|<id>|" prefix
#   12 -> whole "sp|<id>|" prefix     13 -> <id> inside it
#   14 -> identifier token that precedes the free-text remainder
# Every prefix group is optional, so groups 1-13 are None when the
# corresponding prefix is absent.
RE_DISCOVERER_ACCESSION = re.compile(
    r'^'
    # Optional "gi|<id>|" prefix.
    r'(gi\|([\dA-Za-z]+)\|)?'
    # Optional "uc|" prefix followed by up to two optional "<id>|" fields.
    r'(uc\|(([\dA-Za-z]+)\|)?([\dA-Za-z\.]+\|)?)?'
    # Optional "ref|<id>|" prefix.
    r'(ref\|([\dA-Za-z\._]+)\|)?'
    # Optional "gb|<id>|" prefix.
    r'(gb\|([\dA-Za-z\._]+)\|)?'
    # Optional "gnl|<id>|" prefix (its id is not captured separately).
    r'(gnl\|[\dA-Za-z]+\|)?'
    # Optional "sp|<id>|" prefix.
    r'(sp\|([\dA-Za-z\-]+)\|)?'
    # Optional space, identifier token, a space, then the rest of the line.
    r' ?([\dA-Za-z_\:\-]+) .+$'
)

# Module-level cache of fetched UniProt data. fetch_uniprot_data() fills it
# and get_uniprot_data() reads from it; keys are the accession values passed
# to those functions.
UNIPROT_DATA = {}


def fetch_uniprot_data(accessions):
    '''
    Fetch UniProt protein descriptions, gene names, sequences, etc.

    All information is stored in UNIPROT_DATA and can be accessed with
    :func:`.get_uniprot_data`. Accessions that are already present in
    UNIPROT_DATA are not fetched again.

    Parameters
    ----------
    accessions : list of str
        Accessions to look up. Any iterable of hashable values is accepted;
        duplicates are ignored.

    Returns
    -------
    dict of str, dict
        Maps each accession that was *not* already cached to its data, or to
        an empty dict if the lookup returned nothing for it. Accessions that
        were already cached are not included. An empty dict is returned when
        there was nothing new to fetch.
    '''
    # Only query accessions that are not already in the cache.
    accessions = set(accessions).difference(UNIPROT_DATA)

    if not accessions:
        return {}

    # Scratch directory handed to the uniprot package as its cache location.
    cache_dir = tempfile.mkdtemp(suffix='uniprot')

    try:
        UNIPROT_DATA.update(
            uniprot.get_metadata_with_some_seqid_conversions(
                accessions,
                cache_dir=cache_dir,
            )
        )
    finally:
        # Remove the scratch directory even when the lookup raises, so a
        # failed request does not leave a temporary directory behind.
        shutil.rmtree(cache_dir)

    return {
        i: UNIPROT_DATA.get(i, {})
        for i in accessions
    }
def prefetch_all_uniprot():
    '''
    Fetch data for all accessions found in the MS Searched directory.

    Reads every file in ``pyp.paths.MS_SEARCHED_DIR`` whose name ends with
    '_psms.txt', splits the non-null 'Protein Group Accessions' entries on
    ';', strips surrounding whitespace from each piece, and passes the
    combined set of accessions to :func:`.fetch_uniprot_data`.
    '''
    search_dir = pyp.paths.MS_SEARCHED_DIR
    psm_files = [
        name
        for name in os.listdir(search_dir)
        if name.endswith('_psms.txt')
    ]

    accessions = set()

    for name in psm_files:
        table = pd.read_table(os.path.join(search_dir, name))

        # Rows without any accession cannot be split, so drop them first.
        table = table.dropna(subset=['Protein Group Accessions'])

        for entry in table['Protein Group Accessions']:
            for piece in entry.split(';'):
                accessions.add(piece.strip())

    fetch_uniprot_data(accessions)
def prefetch_all_msf_uniprot():
    '''
    Fetch data for all accessions found in MSF files in MS Searched directory.

    Every file in ``pyp.paths.MS_SEARCHED_DIR`` with a ``.msf`` extension
    (compared case-insensitively) is opened as a SQLite database. Each
    ``Description`` value of its ``ProteinAnnotations`` table is matched
    against ``RE_DISCOVERER_ACCESSION`` and the captured values are passed
    to :func:`.fetch_uniprot_data` in a single call.

    Descriptions that are empty, that do not match the pattern, or for which
    the captured group is absent are skipped.
    '''
    accessions = set()

    for filename in os.listdir(pyp.paths.MS_SEARCHED_DIR):
        if os.path.splitext(filename)[1].lower() != '.msf':
            continue

        msf_path = os.path.join(pyp.paths.MS_SEARCHED_DIR, filename)

        # Using a sqlite3 connection as a context manager only commits or
        # rolls back the transaction; it does not close the connection.
        # Close it explicitly so file handles are not leaked per MSF file.
        conn = sqlite3.connect(msf_path)

        try:
            cursor = conn.cursor()

            vals = cursor.execute(
                '''
                SELECT
                ProteinAnnotations.Description
                FROM
                ProteinAnnotations
                '''
            )

            for (prot_string,) in vals:
                if not prot_string:
                    continue

                match = RE_DISCOVERER_ACCESSION.match(prot_string)

                if match is None:
                    # Unparseable description: skip it instead of failing
                    # with an AttributeError on ``None.group``.
                    continue

                # NOTE(review): group 1 is the optional leading "gi|<id>|"
                # prefix as a whole, which is None whenever the description
                # has no such prefix. Confirm this is the intended group.
                accession = match.group(1)

                if accession is not None:
                    accessions.add(accession)
        finally:
            conn.close()

    fetch_uniprot_data(accessions)
def get_uniprot_data(accession):
    '''
    Look up the cached UniProt data for a single protein.

    If the accession is not yet in UNIPROT_DATA, it is fetched first via
    :func:`.fetch_uniprot_data`.

    Parameters
    ----------
    accession : str

    Returns
    -------
    data : dict
        The cached data for the accession, or an empty dict if it is still
        missing from UNIPROT_DATA after the fetch.
    '''
    # Fast path: the accession has already been fetched.
    if accession in UNIPROT_DATA:
        return UNIPROT_DATA[accession]

    fetch_uniprot_data([accession])

    return UNIPROT_DATA.get(accession, {})