# Source code for pyproteome.analysis.tables


from __future__ import absolute_import, division

import collections
import os
import re

import pandas as pd

import pyproteome as pyp


def _prep_csv(data=None, postfix='table', folder_name=None, csv_name=None):
    '''
    Build the output path for a table file inside a 'Tables' sub-folder.

    Parameters
    ----------
    data : object, optional
        Forwarded unchanged to ``pyp.utils.make_folder``.
    postfix : str, optional
        Base name used for the file ('<postfix>.csv') when csv_name is None.
    folder_name : str, optional
        Forwarded unchanged to ``pyp.utils.make_folder``.
    csv_name : str, optional
        Explicit file name; used as-is when given.

    Returns
    -------
    path : str
        The folder returned by ``pyp.utils.make_folder`` joined with the
        file name.
    '''
    file_name = csv_name if csv_name is not None else '{}.csv'.format(postfix)

    out_dir = pyp.utils.make_folder(
        data=data,
        folder_name=folder_name,
        sub='Tables',
    )

    return os.path.join(out_dir, file_name)


def _get_table_title(f=None, running_title=None):
    '''
    Build a dash-separated title describing a filter.

    Parameters
    ----------
    f : dict, optional
        Filter description. Recognized keys:

        - 'asym_fold': adds 'Upregulated' if the value is > 1, otherwise
          'Downregulated'.
        - 'p': adds 'p-' followed by the value in '.3e' notation.
        - 'group_a' / 'group_b': adds '<group_a>vs<group_b>', using an empty
          string for whichever of the two is missing.
    running_title : iterable of str, optional
        Title parts that come before anything derived from f. The input is
        copied and never modified.

    Returns
    -------
    title : str
        All parts joined with '-'. Empty when there are no parts.
    '''
    # Work on a copy: appending to the caller's list would leak title parts
    # into any later call that reuses the same list.
    parts = [] if running_title is None else list(running_title)

    if f is not None:
        if 'asym_fold' in f:
            parts.append(
                'Upregulated' if f['asym_fold'] > 1 else 'Downregulated'
            )

        if 'p' in f:
            parts.append(
                'p-{:.3e}'.format(f['p'])
            )

        if 'group_a' in f or 'group_b' in f:
            parts.append(
                '{}vs{}'.format(
                    f.get('group_a', ''),
                    f.get('group_b', ''),
                )
            )

    return '-'.join(parts)


def motif_table(
    data, f,
    p=0.05,
    sort='p-value',
    **kwargs
):
    '''
    Run a motif enrichment algorithm on a data set and display the
    significantly enriched motifs.

    Parameters
    ----------
    data : :class:`pyproteome.data_sets.DataSet`
    f : dict or list of dict
    p : float, optional
        Only motifs whose p-value is below this cutoff are kept. The
        'pp-value' column is tested instead when ``pp_value=True`` is passed
        through kwargs.
    sort : str, optional
        Column to sort by. Sorting is ascending for 'p-value' and descending
        for any other column.
    **kwargs
        Forwarded to the motif enrichment function.

    Returns
    -------
    styler : :class:`pandas.io.formats.style.Styler`
        Styled view of the filtered hits, using a monospace font and with
        the first header cell of each row hidden.

    See Also
    --------
    :func:`pyproteome.motifs.motif.run_motif_enrichment`
    '''
    enrichment = pyp.motifs.motif.run_motif_enrichment(
        data, f,
        **kwargs
    )
    # Only the first element of the enrichment result is the hits table.
    hits = enrichment[0]

    hits = hits.sort_values(sort, ascending=(sort == 'p-value'))

    if kwargs.get('pp_value', False):
        cutoff_col = 'pp-value'
    else:
        cutoff_col = 'p-value'

    significant = hits[hits[cutoff_col] < p]

    table_styles = [
        {'selector': '*', 'props': [('font-family', 'monospace')]},
        {'selector': 'th:first-child', 'props': [('display', 'none')]},
    ]

    return significant.style.set_table_styles(table_styles)


[docs]def changes_table(
    data,
    sort='p-value',
):
    '''
    Show a table of fold changes and p-values for each unique peptide in a data set.

    Parameters
    ----------
    data : :class:`pyproteome.data_sets.DataSet`
    sort : str, optional

    Returns
    -------
    df : :class:`pandas.DataFrame`
    '''
    psms = getattr(data, 'psms', data)

    psms = psms[
        [
            'Proteins', 'Sequence', 'Modifications',
            'Fold Change', 'p-value', 'Validated',
        ]
    ].copy()
    psms['Sequence'] = psms['Sequence'].apply(
        lambda x: '{} ({})'.format(x, x.modifications)
    )
    psms['Uniprot Accessions'] = psms['Proteins'].apply(
        lambda x: '; '.join(x.accessions)
    )

    if sort == 'Fold Change':
        psms['Fold Change-Sort'] = psms['Fold Change'].apply(
            lambda x: max([x, 1 / x])
        )
        psms.sort_values('Fold Change-Sort', inplace=True, ascending=False)
        psms.drop('Fold Change-Sort', axis=1, inplace=True)
    else:
        psms.sort_values(sort, inplace=True, ascending=True)

    psms.drop('Modifications', axis=1, inplace=True)

    # back_colors = {
    #     True: '#BBFFBB',  # light green
    #     False: '#FFBBBB',  # light red
    # }

    if psms.empty:
        return psms

    # return psms.style.apply(  # Color validated rows
    #     lambda row: [
    #         'background-color: ' + back_colors[row['Validated']]
    #         for _ in row
    #     ],
    #     axis=1,
    # )
    return psms.style.set_table_styles(  # Hide index and 'Validated' columns
        [
            {'selector': 'th:first-child', 'props': [('display', 'none')]},
            {'selector': 'td:last-child', 'props': [('display', 'none')]},
            {'selector': 'th:last-child', 'props': [('display', 'none')]},
            {'selector': '*', 'props': [('text-align', 'left')]},
        ]
    )


def ptmsigdb_changes_table(
    data,
    sort='p-value',
    folder_name=None,
    csv_name=None,
):
    '''
    Show a table of fold changes for PTMSigDB and save it as a .csv file.

    Rows without a fold change are discarded. The remaining rows are sorted
    by fold change, largest first, and written to a .csv file before being
    returned.

    Parameters
    ----------
    data : :class:`pyproteome.data_sets.DataSet` or :class:`pandas.DataFrame`
        Either an object with a ``psms`` attribute or the table itself.
    sort : str, optional
        Accepted but not used by this function; the output is always ordered
        by 'Fold Change'.
    folder_name : str, optional
    csv_name : str, optional
        File name for the saved table. Defaults to 'ptmsigdb.csv'.

    Returns
    -------
    df : :class:`pandas.DataFrame` or :class:`pandas.io.formats.style.Styler`
        The plain (empty) data frame when there are no rows, otherwise a
        styled view with the index hidden.
    '''
    out_path = _prep_csv(
        data=data,
        folder_name=folder_name,
        csv_name=csv_name,
        postfix=_get_table_title(running_title=['ptmsigdb']),
    )

    table = getattr(data, 'psms', data).copy()
    table = table.dropna(subset=('Fold Change',))

    proteins = table['Proteins']
    mods = table['Modifications']

    # Only the first protein / gene / accession of each group is reported.
    table['Protein Description'] = proteins.apply(
        lambda x: x.proteins[0].description
    )
    table['Gene'] = proteins.apply(lambda x: x.genes[0])
    table['Uniprot Accession'] = proteins.apply(lambda x: x.accessions[0])
    table['All Modifications'] = mods
    table['Phospho Modifications'] = mods.apply(
        lambda x: x.get_mods([(None, 'Phospho')]).__str__(prot_index=0)
    )

    out_cols = [
        'Protein Description',
        'Gene',
        'Uniprot Accession',
        'Sequence',
        'All Modifications',
        'Phospho Modifications',
        'Fold Change',
    ]
    table = table[out_cols].sort_values('Fold Change', ascending=False)

    if out_path:
        table.to_csv(out_path, index=False)

    if table.empty:
        return table

    index_hidden = [  # Hide the index column
        {'selector': 'th:first-child', 'props': [('display', 'none')]},
        {'selector': '*', 'props': [('text-align', 'left')]},
    ]

    return table.style.set_table_styles(index_hidden)


def _ds_to_df(data, save_cols=None, sample_values=True):
    '''
    Flatten the PSM table of a data set into a frame of exportable values.

    Parameters
    ----------
    data : :class:`pyproteome.data_sets.DataSet`
        Must provide ``psms``, ``samples`` and ``channels``.
    save_cols : list of str, optional
        Columns copied from ``data.psms``. Names that are not columns of
        ``data.psms`` are skipped. Defaults to 'Fold Change' and 'p-value'.
    sample_values : bool, optional
        Also copy one column per entry of ``data.samples``, looked up through
        ``data.channels`` and renamed to the sample name.

    Returns
    -------
    df : :class:`pandas.DataFrame`
        Columns 'Proteins', 'Genes', 'Uniprot Accessions', 'Modifications'
        and 'Sequence', then the saved columns and the sample columns.
        Sorted by ascending 'p-value' when that column was selected.
    '''
    if sample_values:
        channels = [(name, data.channels[name]) for name in data.samples]
    else:
        channels = []

    if save_cols is None:
        save_cols = ['Fold Change', 'p-value']
    else:
        # Skip requested columns that this data set does not have.
        save_cols = [i for i in save_cols if i in data.psms.columns]

    cols = ['Proteins', 'Sequence']
    cols = cols + save_cols + [chan for _, chan in channels]

    df = data.psms[cols].copy()
    df = df.rename(columns=dict((chan, name) for name, chan in channels))

    # (new column, source column, converter), placed at positions 1, 2 and 3
    # right after 'Proteins'.
    derived = (
        ('Genes', 'Proteins', lambda x: ' / '.join(x.genes)),
        ('Uniprot Accessions', 'Proteins', lambda x: ' / '.join(x.accessions)),
        ('Modifications', 'Sequence', lambda x: str(x.modifications)),
    )

    for position, (label, source, convert) in enumerate(derived, 1):
        df.insert(position, label, df[source].apply(convert))

    # 'Modifications' above reads the sequence objects, so they are only
    # converted to text afterwards.
    df['Sequence'] = df['Sequence'].apply(str)

    if 'p-value' in cols:
        df = df.sort_values('p-value', ascending=True)

    return df


def write_csv(data, folder_name=None, out_name='DataSet.csv'):
    '''
    Write information for a single data set to a .csv file.

    The file holds one row per entry of ``data.psms`` with protein, gene,
    accession, modification, sequence, fold change, p-value and per-sample
    columns. The index is not written.

    Parameters
    ----------
    data : :class:`pyproteome.data_sets.DataSet`
    folder_name : str, optional
    out_name : str, optional

    Returns
    -------
    path : str
        Path to .csv file.
    '''
    # The output folder is resolved without reference to the data set
    # (data=None).
    csv_path = _prep_csv(
        data=None,
        folder_name=folder_name,
        csv_name=out_name,
    )

    table = _ds_to_df(data)
    table.to_csv(csv_path, index=False)

    return csv_path


def _xlsx_sheet_name(name):
    '''Turn a data set name into a worksheet name that Excel accepts.'''
    # Excel rejects [ ] : * ? / \ in sheet names and limits them to 31
    # characters.
    return re.sub(r'[\[\]:*?/\\]', '+', name)[:31]


def _xlsx_color_scale(col):
    '''Return the 3-color-scale format for a column name, or None.'''
    if col.endswith('-p'):
        # p-values: red at 0, orange at 1e-4, fading to white at 0.05
        values = (0, 1e-4, 0.05)
        colors = ('#FF4444', '#FF8844', '#FFFFFF')
    elif col.endswith('-FC'):
        # Fold changes: blue at 0.25, white at 1, red at 4
        values = (.25, 1, 4)
        colors = ('#6666FF', '#FFFFFF', '#FF6666')
    elif col.endswith(('-Corr', '-Correlation')):
        # Correlations: blue at -1, white at 0, red at 1
        values = (-1, 0, 1)
        colors = ('#6666FF', '#FFFFFF', '#FF6666')
    else:
        return None

    return {
        'type': '3_color_scale',
        'min_type': 'num',
        'mid_type': 'num',
        'max_type': 'num',
        'min_value': values[0],
        'mid_value': values[1],
        'max_value': values[2],
        'min_color': colors[0],
        'mid_color': colors[1],
        'max_color': colors[2],
    }


def write_full_tables(
    datas,
    save_cols=None,
    sample_values=True,
    folder_name=None,
    out_name='Full Data.xlsx',
):
    '''
    Write information for a list of data sets to sheets of a .xlsx file.

    Each data set gets its own sheet, named after the data set. Characters
    that Excel does not allow in sheet names are replaced with '+' and names
    are cut to 31 characters. Every sheet has a frozen header row, an
    autofilter over all columns, and a 3-color scale on columns whose names
    end in '-p', '-FC', '-Corr' or '-Correlation'.

    Parameters
    ----------
    datas : list of :class:`pyproteome.data_sets.DataSet`
    save_cols : list of str, optional
        Extra column names to save from in each dataset.
    sample_values : bool, optional
        Save normalized TMT values for each sample to the output.
    folder_name : str, optional
    out_name : str, optional

    Returns
    -------
    path : str
        Path to .xlsx file.
    '''

    out_name = _prep_csv(
        data=None,
        folder_name=folder_name,
        csv_name=out_name,
    )

    # The context manager writes and closes the workbook on exit, also when
    # a sheet fails part-way. ExcelWriter.save() no longer exists in
    # pandas >= 2.0.
    with pd.ExcelWriter(out_name, engine='xlsxwriter') as writer:
        for data in datas:
            df = _ds_to_df(
                data,
                save_cols=save_cols,
                sample_values=sample_values,
            )
            n_rows, n_cols = df.shape

            ws_name = _xlsx_sheet_name(data.name)
            df.to_excel(
                writer,
                sheet_name=ws_name,
                index=False,
            )

            ws = writer.sheets[ws_name]

            # Keep the header row visible while scrolling.
            ws.freeze_panes(1, 0)

            # Widths of the first five columns.
            for col_ind, width in enumerate((60, 15, 15, 20, 20)):
                ws.set_column(col_ind, col_ind, width)

            ws.autofilter(0, 0, n_rows, n_cols - 1)

            for ind, col in enumerate(df.columns):
                color_scale = _xlsx_color_scale(col)

                if color_scale is None:
                    continue

                ws.conditional_format(0, ind, n_rows, ind, color_scale)

    return out_name