Source code for pyproteome.pride
'''
This module provides functionality for accessing public data through the
PRoteomics IDEntifications (PRIDE) database / ProteomeXchange.
'''
import os
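# XXX: xml.etree.ElementTree is not hardened against maliciously constructed
# XML. It should be replaced with a safer parser (e.g. defusedxml); otherwise
# a man-in-the-middle attacker who tampers with the downloaded metadata could
# exploit the parser.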
import xml.etree.ElementTree as ET
import requests
# URL template for a dataset's metadata record on ProteomeCentral. The ``{}``
# placeholder is replaced with the dataset identifier, and ``outputMode=XML``
# requests the record as XML. HTTPS is used so that the metadata (which
# carries the download URL of every file) cannot be altered in transit.
META_DATA_URL = (
    'https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID={}&'
    'outputMode=XML&test=no'
)
def list_data_set(accession):
    '''
    Lists files contained in a deposition on PRIDE.

    Information is fetched from pride.META_DATA_URL.

    Parameters
    ----------
    accession : str
        An accession ID starting with 'PXD', i.e. 'PXD003660'.

    Returns
    -------
    info_list : list of :class:`xml.etree.ElementTree.Element`
        One ``DatasetFile`` element for each file listed in the repository's
        metadata.

    Raises
    ------
    ValueError
        If `accession` does not start with 'PXD', or if the text following
        that prefix is not an integer.
    requests.HTTPError
        If the metadata request returns an error status code.

    Examples
    --------
    >>> lst = pride.list_data_set('PXD003660')
    >>> lst[0].get('name')
    '20140524_MCF10A_E20VR1_ETP_TMT10.raw'
    >>> lst[0].find('cvParam').get('value')
    'ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2016/06/PXD003660/20140524_MCF10A_E20VR1_ETP_TMT10.raw'
    '''
    # Validate explicitly instead of using ``assert``: assertions are removed
    # when Python runs with -O, which would let a malformed accession through.
    if not accession.startswith('PXD'):
        raise ValueError(
            "accession must start with 'PXD': {!r}".format(accession)
        )

    # int() discards the zero padding, so the URL is built from the bare
    # number (e.g. 3660 for 'PXD003660').
    proj_id = int(accession[3:])

    # Fetch xml file using requests
    meta_data = requests.get(META_DATA_URL.format(proj_id))
    meta_data.raise_for_status()

    # Parse the raw bytes rather than ``.text`` so the XML parser applies the
    # encoding declared by the document itself instead of the charset that
    # requests guesses from the HTTP headers.
    # NOTE(review): xml.etree.ElementTree is not hardened against malicious
    # XML, and this document comes from a remote server.
    root = ET.fromstring(meta_data.content)

    # findall() already returns a list of the matching elements.
    return root.findall('DatasetFileList/DatasetFile')
def fetch_data_set(accession, files=None):
    '''
    Fetches files from a deposition on PRIDE.

    Parameters
    ----------
    accession : str
        A PRIDE accession ID. i.e. 'PXD001038'
    files : dict of str, str, optional
        Download individual files to a specific location, given as a mapping
        of file name to destination folder. Only files whose names are in
        `files` are downloaded. By default, this function downloads all
        files to the current working directory.

    Returns
    -------
    file_list : list of str
        Names of the files downloaded from a repository.

    Raises
    ------
    requests.HTTPError
        If a file request returns an error status code.

    Examples
    --------
    >>> pride.fetch_data_set(
    ...     'PXD001038',
    ...     files={'HJ070512_OCTFF_B2_All5Fractions_PeptideSummary.zip': '.'},
    ... )
    ['HJ070512_OCTFF_B2_All5Fractions_PeptideSummary.zip']
    '''
    ds = list_data_set(accession)
    ret = []

    for file_root in ds:
        name = file_root.get('name')

        # A non-empty `files` acts as a filter on the file names.
        if files and name not in files:
            continue

        if isinstance(files, dict) and name in files:
            folder = files[name]
        else:
            folder = os.getcwd()

        # The name comes from the remote metadata, so keep only its final
        # path component: an absolute or '../'-style name must not be able
        # to place the download outside of `folder`.
        out_path = os.path.join(folder, os.path.basename(name))
        file_url = file_root.find('cvParam').get('value')

        # Requests cannot fetch FTP files
        if file_url.startswith('ftp://'):
            file_url = 'http://{}'.format(file_url[len('ftp://'):])

        # Use the response as a context manager so the streamed connection
        # is released even when the status check or a write fails.
        with requests.get(file_url, stream=True) as response:
            response.raise_for_status()

            with open(out_path, mode='wb') as f:
                for block in response.iter_content(1024):
                    f.write(block)

        ret.append(name)

    return ret