Source code for pyproteome.pride

'''
This module provides functionality for accessing public data through
PRIDE PRoteomics IDEntifications (PRIDE) / Proteome Xchange.
'''

import os
# XXX: This should be a safer alternative package. Otherwise users could be
# vulnerable to a MITM attack
import xml.etree.ElementTree as ET

import requests


META_DATA_URL = (
    'http://proteomecentral.proteomexchange.org/cgi/GetDataset?ID={}&'
    'outputMode=XML&test=no'
)


[docs]def list_data_set(accession):
    '''
    Lists files contained in a deposition on PRIDE.

    Information is fetched from pride.META_DATA_URL.

    Parameters
    ----------
    accession : str

    Returns
    -------
    info_list : list of :class:`xml.etree.ElementTree`
        Information on files available in a repository.

    Examples
    --------
    >>> lst = pride.list_data_set('PXD003660')
    >>> lst[0].get('name')
    '20140524_MCF10A_E20VR1_ETP_TMT10.raw'
    >>> lst[0].find('cvParam').get('value')
    'ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2016/06/PXD003660/20140524_MCF10A_E20VR1_ETP_TMT10.raw'
    '''
    assert accession.startswith('PXD')

    proj_id = int(accession[3:])

    # Fetch xml file using requests
    meta_data = requests.get(META_DATA_URL.format(proj_id))
    meta_data.raise_for_status()

    root = ET.fromstring(meta_data.text)

    return [
        i
        for i in root.findall('DatasetFileList/DatasetFile')
    ]


[docs]def fetch_data_set(accession, files=None):
    '''
    Fetches files from a deposition on PRIDE.

    Parameters
    ----------
    accession : str
        A PRIDE accession ID. i.e. 'PXD001038'
    files : dict of str, str
        Download individual files to a specific location. By default, this
        function downloads all files to the current working directory.

    Returns
    -------
    file_list : list of str
        Files downloaded from a repository.

    Examples
    --------
    >>> pride.fetch_data_set(
    ...     'PXD001038',
    ...     files={'HJ070512_OCTFF_B2_All5Fractions_PeptideSummary.zip': '.'},
    ... )
    ['HJ070512_OCTFF_B2_All5Fractions_PeptideSummary.zip']
    '''
    ds = list_data_set(accession)
    ret = []

    for file_root in ds:
        name = file_root.get('name')

        if files and name not in files:
            continue

        if isinstance(files, dict) and name in files:
            folder = files[name]
        else:
            folder = os.getcwd()

        out_path = os.path.join(folder, name)
        file_url = file_root.find('cvParam').get('value')

        # Requests cannot fetch FTP files
        if file_url.startswith('ftp://'):
            file_url = 'http://{}'.format(file_url[len('ftp://'):])

        response = requests.get(file_url, stream=True)
        response.raise_for_status()

        with open(out_path, mode='wb') as f:
            for block in response.iter_content(1024):
                f.write(block)

        ret.append(name)

    return ret