from __future__ import absolute_import, division
import collections
import os
import re
import pandas as pd
import pyproteome as pyp
def _prep_csv(data=None, postfix='table', folder_name=None, csv_name=None):
    '''
    Build the output path for a table file inside a 'Tables' sub-folder.

    Parameters
    ----------
    data : optional
        Forwarded unchanged to ``pyp.utils.make_folder``.
    postfix : str, optional
        Base name used for the default file name, ``'<postfix>.csv'``. Only
        consulted when `csv_name` is None.
    folder_name : str, optional
        Forwarded unchanged to ``pyp.utils.make_folder``.
    csv_name : str, optional
        Explicit file name. Used verbatim when given.

    Returns
    -------
    path : str
        The folder returned by ``pyp.utils.make_folder`` joined with the
        file name.
    '''
    # Fall back to a name derived from the postfix when none was supplied.
    file_name = csv_name
    if file_name is None:
        file_name = '{}.csv'.format(postfix)

    # NOTE(review): make_folder is defined outside this file; presumably it
    # also creates the directory on disk -- confirm against pyp.utils.
    target_dir = pyp.utils.make_folder(
        data=data,
        folder_name=folder_name,
        sub='Tables',
    )

    return os.path.join(target_dir, file_name)
def _get_table_title(f=None, running_title=None):
if running_title is None:
running_title = []
if f is not None:
if 'asym_fold' in f:
running_title.append(
'Upregulated' if f['asym_fold'] > 1 else 'Downregulated'
)
if 'p' in f:
running_title.append(
'p-{:.3e}'.format(f['p'])
)
if 'group_a' in f or 'group_b' in f:
running_title.append(
'{}vs{}'.format(
f.get('group_a', ''),
f.get('group_b', ''),
)
)
return '-'.join(running_title)
def motif_table(
    data, f,
    p=0.05,
    sort='p-value',
    **kwargs
):
    '''
    Run a motif enrichment algorithm on a data set and display the
    significantly enriched motifs.

    Parameters
    ----------
    data : :class:`pyproteome.data_sets.DataSet`
    f : dict or list of dict
    p : float, optional
        Cutoff; only rows whose p-value column is strictly below this are
        kept.
    sort : str, optional
        Column to sort by. Sorting is ascending for ``'p-value'`` and
        descending for any other column.
    **kwargs
        Forwarded to
        :func:`pyproteome.motifs.motif.run_motif_enrichment`. If
        ``pp_value`` is truthy, the cutoff is applied to the ``'pp-value'``
        column instead of ``'p-value'``.

    Returns
    -------
    styler : pandas Styler
        The filtered hits wrapped in a styler (monospace font, first header
        cell of each row hidden).

    See Also
    --------
    :func:`pyproteome.motifs.motif.run_motif_enrichment`
    '''
    # Only the first element of the enrichment result is used here.
    enrichment = pyp.motifs.motif.run_motif_enrichment(
        data, f,
        **kwargs
    )
    hits = enrichment[0]

    hits = hits.sort_values(sort, ascending=(sort == 'p-value'))

    # Pick the column the significance cutoff applies to.
    if kwargs.get('pp_value', False):
        cutoff_col = 'pp-value'
    else:
        cutoff_col = 'p-value'

    significant = hits[hits[cutoff_col] < p]

    table_styles = [
        {'selector': '*', 'props': [('font-family', 'monospace')]},
        {'selector': 'th:first-child', 'props': [('display', 'none')]},
    ]
    return significant.style.set_table_styles(table_styles)
def changes_table(
    data,
    sort='p-value',
):
    '''
    Show a table of fold changes and p-values for each unique peptide in a
    data set.

    Parameters
    ----------
    data : :class:`pyproteome.data_sets.DataSet`
        Either an object with a ``psms`` attribute or the PSM table itself.
    sort : str, optional
        Column to sort by (ascending). The special value ``'Fold Change'``
        sorts by the larger of the fold change and its reciprocal,
        descending, so the strongest changes in either direction come first.

    Returns
    -------
    table : :class:`pandas.DataFrame` or pandas Styler
        The plain frame when it is empty, otherwise a styler over it.
    '''
    def _with_modifications(seq):
        return '{} ({})'.format(seq, seq.modifications)

    def _joined_accessions(prots):
        return '; '.join(prots.accessions)

    def _magnitude(fold):
        return max(fold, 1 / fold)

    source = getattr(data, 'psms', data)
    wanted = [
        'Proteins', 'Sequence', 'Modifications',
        'Fold Change', 'p-value', 'Validated',
    ]
    table = source[wanted].copy()

    table['Sequence'] = table['Sequence'].apply(_with_modifications)
    table['Uniprot Accessions'] = table['Proteins'].apply(_joined_accessions)

    if sort != 'Fold Change':
        table = table.sort_values(sort, ascending=True)
    else:
        # Temporary helper column, removed again right after sorting.
        table['Fold Change-Sort'] = table['Fold Change'].apply(_magnitude)
        table = table.sort_values('Fold Change-Sort', ascending=False)
        table = table.drop('Fold Change-Sort', axis=1)

    table = table.drop('Modifications', axis=1)

    if table.empty:
        return table

    # Hide the index header cell and the last column, left-align everything.
    # NOTE(review): the last column at this point is 'Uniprot Accessions'
    # (appended above), not 'Validated' -- confirm which one is meant to be
    # hidden.
    hidden = [('display', 'none')]
    return table.style.set_table_styles(
        [
            {'selector': 'th:first-child', 'props': hidden},
            {'selector': 'td:last-child', 'props': [('display', 'none')]},
            {'selector': 'th:last-child', 'props': [('display', 'none')]},
            {'selector': '*', 'props': [('text-align', 'left')]},
        ]
    )
def ptmsigdb_changes_table(
    data,
    sort='p-value',
    folder_name=None,
    csv_name=None,
):
    '''
    Show a table of fold changes for PTMSigDB and save it as a .csv file.

    Parameters
    ----------
    data : :class:`pyproteome.data_sets.DataSet`
        Either an object with a ``psms`` attribute or the PSM table itself.
    sort : str, optional
        Accepted but not used: rows are always ordered by descending
        ``'Fold Change'``.
    folder_name : str, optional
    csv_name : str, optional
        Output file name; defaults to ``'ptmsigdb.csv'``.

    Returns
    -------
    table : :class:`pandas.DataFrame` or pandas Styler
        The plain frame when it is empty, otherwise a styler over it.
    '''
    out_path = _prep_csv(
        data=data,
        folder_name=folder_name,
        csv_name=csv_name,
        postfix=_get_table_title(running_title=['ptmsigdb']),
    )

    table = getattr(data, 'psms', data).copy()
    # Rows without a fold change are not reported.
    table = table.dropna(subset=('Fold Change',))

    # Only the first protein / gene / accession of each entry is reported.
    proteins = table['Proteins']
    table['Protein Description'] = proteins.apply(
        lambda prots: prots.proteins[0].description
    )
    table['Gene'] = proteins.apply(lambda prots: prots.genes[0])
    table['Uniprot Accession'] = proteins.apply(
        lambda prots: prots.accessions[0]
    )

    modifications = table['Modifications']
    table['All Modifications'] = modifications
    table['Phospho Modifications'] = modifications.apply(
        lambda mods: mods.get_mods([(None, 'Phospho')]).__str__(prot_index=0)
    )

    output_columns = [
        'Protein Description',
        'Gene',
        'Uniprot Accession',
        'Sequence',
        'All Modifications',
        'Phospho Modifications',
        'Fold Change',
    ]
    table = table[output_columns].copy()
    table = table.sort_values('Fold Change', ascending=False)

    if out_path:
        table.to_csv(out_path, index=False)

    if table.empty:
        return table

    # Hide the index header cell and left-align everything.
    return table.style.set_table_styles(
        [
            {'selector': 'th:first-child', 'props': [('display', 'none')]},
            {'selector': '*', 'props': [('text-align', 'left')]},
        ]
    )
def _ds_to_df(data, save_cols=None, sample_values=True):
channels = [
(name, data.channels[name])
for name in data.samples
] if sample_values else []
if save_cols is None:
save_cols = ['Fold Change', 'p-value']
else:
save_cols = [i for i in save_cols if i in data.psms.columns]
cols = [
'Proteins', 'Sequence', #'Scan',
] + save_cols + [
chan
for _, chan in channels
]
df = data.psms[cols].copy()
df.rename(
columns={
chan: name
for name, chan in channels
},
inplace=True,
)
df.insert(
1, 'Genes',
df['Proteins'].apply(
lambda x:
' / '.join(x.genes)
),
)
df.insert(
2, 'Uniprot Accessions',
df['Proteins'].apply(
lambda x:
' / '.join(x.accessions)
),
)
df.insert(
3, 'Modifications',
df['Sequence'].apply(
lambda x: str(x.modifications)
),
)
df['Sequence'] = df['Sequence'].apply(str)
# df['Scan'] = df['Scan'].apply(
# lambda x:
# ', '.join([str(i) for i in x])
# if isinstance(x, collections.Iterable) else
# str(x)
# )
if 'p-value' in cols:
df.sort_values('p-value', inplace=True, ascending=True)
return df
def write_csv(data, folder_name=None, out_name='DataSet.csv'):
    '''
    Write information for a single data set to a .csv file.

    The file holds one row per entry of the data set's PSM table, with
    protein, peptide and quantification columns.

    Parameters
    ----------
    data : :class:`pyproteome.data_sets.DataSet`
    folder_name : str, optional
    out_name : str, optional
        File name of the .csv file.

    Returns
    -------
    path : str
        Path to .csv file.
    '''
    # The data set itself is deliberately not passed to _prep_csv (data=None),
    # so the output folder is resolved from `folder_name` alone.
    csv_path = _prep_csv(
        data=None,
        folder_name=folder_name,
        csv_name=out_name,
    )

    _ds_to_df(data).to_csv(csv_path, index=False)

    return csv_path
def _column_color_scale(col):
    '''
    Return the 3-color-scale conditional format for a column name, or None
    when the column's suffix does not call for one.
    '''
    if col.endswith('-p'):
        # p-values: red at 0, orange at 1e-4, fading to white at 0.05.
        values = (0, 1e-4, 0.05)
        colors = ('#FF4444', '#FF8844', '#FFFFFF')
    elif col.endswith('-FC'):
        # Fold changes: blue at 0.25, white at 1, red at 4.
        values = (.25, 1, 4)
        colors = ('#6666FF', '#FFFFFF', '#FF6666')
    elif col.endswith(('-Corr', '-Correlation')):
        # Correlations: blue at -1, white at 0, red at 1.
        values = (-1, 0, 1)
        colors = ('#6666FF', '#FFFFFF', '#FF6666')
    else:
        return None

    return {
        'type': '3_color_scale',
        'min_type': 'num',
        'mid_type': 'num',
        'max_type': 'num',
        'min_value': values[0],
        'mid_value': values[1],
        'max_value': values[2],
        'min_color': colors[0],
        'mid_color': colors[1],
        'max_color': colors[2],
    }


def write_full_tables(
    datas,
    save_cols=None,
    sample_values=True,
    folder_name=None,
    out_name='Full Data.xlsx',
):
    '''
    Write information for a list of data sets to sheets of a .xlsx file.

    Each data set gets its own sheet (named after the data set, with ``'/'``
    replaced by ``'+'``) holding protein, peptide and quantification values.
    The header row is frozen, an autofilter covers the whole table, and
    columns whose names end in ``-p``, ``-FC``, ``-Corr`` or
    ``-Correlation`` are shaded with a 3-color scale.

    Parameters
    ----------
    datas : list of :class:`pyproteome.data_sets.DataSet`
    save_cols : list of str, optional
        Extra column names to save from in each dataset.
    sample_values : bool, optional
        Save normalized TMT values for each sample to the output.
    folder_name : str, optional
    out_name : str, optional

    Returns
    -------
    path : str
        Path to .xlsx file.
    '''
    out_name = _prep_csv(
        data=None,
        folder_name=folder_name,
        csv_name=out_name,
    )

    # Width of the five leading columns ('Proteins', 'Genes',
    # 'Uniprot Accessions', 'Modifications', 'Sequence').
    leading_widths = (60, 15, 15, 20, 20)

    # The context manager finalizes the workbook and releases the file handle
    # even if a data set fails part-way through; it also avoids
    # ExcelWriter.save(), which no longer exists in recent pandas releases.
    with pd.ExcelWriter(out_name, engine='xlsxwriter') as writer:
        for data in datas:
            df = _ds_to_df(
                data,
                save_cols=save_cols,
                sample_values=sample_values,
            )

            # '/' is not allowed in a worksheet name.
            ws_name = data.name.replace('/', '+')

            df.to_excel(
                writer,
                sheet_name=ws_name,
                index=False,
            )

            n_rows, n_cols = df.shape
            ws = writer.sheets[ws_name]
            ws.freeze_panes(1, 0)

            for col_ind, width in enumerate(leading_widths):
                ws.set_column(col_ind, col_ind, width)

            ws.autofilter(0, 0, n_rows, n_cols - 1)

            for ind, col in enumerate(df.columns):
                color_scale = _column_color_scale(col)

                if color_scale is None:
                    continue

                # Range spans the header row through the last data row.
                ws.conditional_format(0, ind, n_rows, ind, color_scale)

    return out_name