Source code for pyproteome.analysis.correlation

# -*- coding: UTF-8 -*-
'''
This module provides functionality for data set analysis.

Functions include volcano plots, sorted tables, and plotting sequence levels.
'''

from __future__ import absolute_import, division

import logging
import warnings

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import pyproteome as pyp


LOGGER = logging.getLogger('pyproteome.correlation')


[docs]def correlate_data_sets(
    data1, data2,
    adjust=True,
    label_cutoff=1.5,
    show_labels=False,
    show_title=True,
    ax=None,
):
    '''
    Plot the correlation between peptides levels in two different data sets.

    Parameters
    ----------
    data1 : :class:`pyproteome.data_sets.DataSet`
    data2 : :class:`pyproteome.data_sets.DataSet`
    filename : str, optional
    '''
    LOGGER.info('Plotting dataset correlation')

    merged = pd.merge(
        data1.psms, data2.psms,
        on='Sequence',
    ).dropna(subset=('Fold Change_x', 'Fold Change_y'))

    if ax is None:
        f, ax = plt.subplots(
            figsize=(4, 3),
        )
    else:
        f = ax.get_figure()

    merged['Fold Change_x'] = merged['Fold Change_x'].apply(np.log2)
    merged['Fold Change_y'] = merged['Fold Change_y'].apply(np.log2)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=FutureWarning)
        sns.regplot(
            x='Fold Change_x',
            y='Fold Change_y',
            data=merged,
            ax=ax,
            scatter_kws={
                's': 2,
            },
        )

    label_cutoff = np.log2(label_cutoff)

    if show_labels:
        texts = []

        for _, row in merged.iterrows():
            x = row['Fold Change_x']
            y = row['Fold Change_y']
            ratio = x - y

            if ratio < label_cutoff and ratio > - label_cutoff:
                continue

            txt = ' / '.join(row['Proteins_x'].genes)
            txt = txt[:20] + ('...' if len(txt) > 20 else '')

            text = ax.text(
                x=x,
                y=y,
                s=txt,
                bbox=dict(
                    color='lightgreen' if ratio < 0 else 'pink',
                    alpha=0.8,
                ),
            )

            texts.append(text)

        if adjust and texts:
            pyp.utils.adjust_text(
                texts=texts,
                ax=ax,
                lim=400,
                force_text=0.1,
                force_points=0.1,
                arrowprops=dict(arrowstyle='->', relpos=(0, 0), lw=1),
                only_move={
                    'points': 'y',
                    'text': 'xy',
                }
            )

    min_x = merged['Fold Change_x'].min()
    max_x = merged['Fold Change_x'].max()
    min_y = merged['Fold Change_y'].min()
    max_y = merged['Fold Change_y'].max()

    min_xy = min([min_x, min_y])
    max_xy = max([max_x, max_y])

    ax.plot([min_xy, max_xy], [min_xy, max_xy], '--', color='k')
    ax.set_xlim(left=min_xy - .5, right=max_xy + .5)
    ax.set_ylim(bottom=min_xy - .5, top=max_xy + .5)

    name1 = data1.name
    name2 = data2.name

    ax.set_xlabel('Fold Change -- {}'.format(name1))
    ax.set_ylabel('Fold Change -- {}'.format(name2))

    ax.set_xticklabels(
        ['{:.2f}'.format(i) for i in np.power(2, ax.get_xticks())],
    )
    ax.set_yticklabels(
        ['{:.2f}'.format(i) for i in np.power(2, ax.get_yticks())],
    )

    if show_title:
        pear_corr = merged['Fold Change_x'].corr(
            merged['Fold Change_y'],
            method='pearson',
        )
        # spear_corr = merged['Fold Change_x'].corr(
        #     merged['Fold Change_y'],
        #     method='spearman',
        # )

        ax.set_title(
            (
                r'Pearson\'s: $\rho$={:.2f}'
                # r'Spearman\'s: $\rho$={:.2f}'
            ).format(pear_corr)
        )

    return f


def _scatter_plots(
    cp, signal,
    data_chans, signal_chans, signal_groups,
    scatter_cols=4,
    xlabel='',
    scatter_colors=None,
    scatter_symbols=None,
):
    scatter_colors = scatter_colors or {}
    scatter_symbols = scatter_symbols or {}

    if cp.shape[0] < 1:
        return None

    df = pd.DataFrame(
        [
            (
                row,
                str(row['Sequence']),
                signal[sig_chan],
                row[data_chan],
                sig_group,
                scatter_colors.get(sig_group, 'black'),
            )
            for _, row in cp.psms.sort_values('Correlation').iterrows()
            for data_chan, sig_chan, sig_group in zip(
                data_chans, signal_chans, signal_groups,
            )
        ],
        columns=('row', 'sequence', 'x', 'y', 'group', 'color'),
    )
    g = sns.lmplot(
        x='x',
        y='y',
        hue='group',
        col='sequence',
        col_wrap=scatter_cols,
        data=df,
        sharey=False,
        fit_reg=False,
    )
    for ax in g.axes:
        seq = ax.get_title().split('=', 1)[1].strip()
        df_cp = df[df['sequence'] == seq]
        row = df_cp['row'].iloc[0]

        row_title = ' / '.join(str(i.gene) for i in row['Proteins'])
        row_seq = str(row['Sequence'])
        row_mods = str(row['Modifications'].get_mods([(None, 'Phospho')]))

        sns.regplot(
            x='x',
            y='y',
            data=df_cp,
            scatter=False,
            ax=ax,
        )

        ax.set_title(
            row_title[:20] + ('...' if len(row_title) > 20 else ''),
            # fontsize=28,
            # fontweight='bold',
        )
        # ax.set_xlabel(
        #     '{}\n$\\rho = {:.2f}$'.format(
        #         xlabel,
        #         row['Correlation'],
        #     ),
        #     fontsize=22,
        # )
        ax.set_ylabel(
            row_seq[:20] + (
                '...' if len(row_seq) > 20 else ''
            ) + (
                '\n({})'.format(
                    row_mods[:20] + ('...' if len(row_mods) > 20 else '')
                ) if row_mods else ''
            ),
            # fontsize=20,
        )

        # for tick in ax.xaxis.get_major_ticks() + ax.yaxis.get_major_ticks():
        #     tick.label.set_fontsize(20)

        ax.legend(loc='best')

    return g


def _remove_lesser_dups(labels, compress_sym=False):
    new_labels = []
    labels = list(labels)

    for index, (x, y, label) in enumerate(labels):
        for o_index, (o_x, o_y, o_label) in enumerate(labels):
            if index == o_index:
                continue
            if label != o_label:
                continue
            if not compress_sym and (y < 0) != (y < 0):
                continue

            mul = 1 if y >= 0 else -1
            o_mul = 1 if o_y >= 0 else -1

            if (
                mul * y < o_mul * o_y
            ) or (
                (
                    mul * y <= o_mul * o_y
                ) and
                index < o_index
            ):
                break
        else:
            new_labels.append((x, y, label))

    return new_labels


[docs]def correlate_signal(
    data,
    signal,
    corr_cutoff=0.8,
    scatter_cols=4,
    options=None,
    title=None,
    show_duplicates=False,
    scatter_colors=None,
    scatter_symbols=None,
    show_scatter=True,
    ax=None,
    xlabel='',
):
    '''
    Calculate the correlation between levels of each peptide in a data set and
    a given signal variable.

    Parameters
    ----------
    data : :class:`pyproteome.data_sets.DataSet`
    signal : :class:`pandas.Series`

    Returns
    -------
    f_corr : :class:`matplotlib.figure.Figure`
    f_scatter : :class:`matplotlib.figure.Figure`
    '''
    options = options or {}

    highlight = options.get('highlight', {})
    hide = options.get('hide', {})
    edgecolors = options.get('edgecolors', {})
    rename = options.get('rename', {})
    scatter_colors = options.get('scatter_colors', {})
    scatter_symbols = options.get('scatter_symbols', {})

    if title is None:
        title = data.name

    cp = data.copy()
    cp = cp.rename_channels()

    cols = getattr(signal, 'columns', getattr(signal, 'index', []))

    signal_groups = [
        label
        for label, group in cp.groups.items()
        for chan in group
        if chan in cp.channels.keys() and
        chan in cols
    ]

    signal_chans = [
        chan
        for group in cp.groups.values()
        for chan in group
        if chan in cp.channels.keys() and
        chan in cols
    ]
    data_chans = [
        cp.channels[chan]
        for chan in signal_chans
    ]

    cp.psms['Correlation'] = cp.psms.apply(
        lambda row:
        signal[signal_chans].corr(
            pd.to_numeric(row[data_chans]),
            method='spearman',
            min_periods=5,
        ),
        axis=1,
    ).replace([np.inf, -np.inf], np.nan)
    cp.psms = cp.psms.dropna(
        subset=['Correlation'],
        how='any',
    )

    if ax is None:
        f_corr, ax = plt.subplots(figsize=(12, 10))

    x, y, colors = [], [], []
    sig_x, sig_y, sig_labels = [], [], []

    for index, (_, row) in enumerate(cp.psms.iterrows()):
        x.append(index)
        y.append(row['Correlation'])

        sig = abs(row['Correlation']) >= corr_cutoff

        if sig:
            sig_x.append(index)
            sig_y.append(row['Correlation'])
            sig_labels.append(' / '.join(row['Proteins'].genes))

        colors.append(
            'blue'
            if sig else
            '{:.2f}'.format(
                max([len(row[data_chans].dropna()) / len(data_chans) - .25, 0])
            )
        )

    ax.scatter(x, y, c=colors)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    labels = zip(sig_x, sig_y, sig_labels)

    if not show_duplicates:
        labels = _remove_lesser_dups(labels)

    labels = sorted(labels, key=lambda x: x[1], reverse=True)[:100]
    LOGGER.info('Showing names for {} genes'.format(len(labels)))

    texts = []

    for xs, ys, txt in labels:
        if any(i.strip() in hide for i in txt.split('/')):
            continue

        txt = rename.get(txt, txt)

        texts.append(
            ax.text(
                x=xs,
                y=ys,
                s=txt[:20] + ('...' if len(txt) > 20 else ''),
                fontsize=20 if txt in highlight else 16,
                bbox=dict(
                    facecolor='lightgreen' if xs > 0 else 'pink',
                    alpha=1,
                    linewidth=0.5 if txt not in edgecolors else 3,
                    edgecolor=edgecolors.get(txt, 'black'),
                    boxstyle='round',
                ),
            )
        )

    if texts:
        pyp.utils.adjust_text(
            texts=texts,
            ax=ax,
            lim=400,
            force_text=0.3,
            force_points=0.01,
            arrowprops=dict(arrowstyle='->', relpos=(0, 0), lw=1),
            only_move={
                'points': 'y',
                'text': 'xy',
            }
        )

    ax.set_xlabel(
        'Index',
    )
    # ax.set_yticklabels(
    #     '{:.3}'.format(i)
    #     for i in np.power(1/10, ax.get_yticks())
    # )
    ax.set_ylabel(
        'Correlation',
    )

    if title:
        ax.set_title(title)

    cp.psms = cp.psms[cp.psms['Correlation'].apply(abs) >= corr_cutoff]

    f_scatter = None

    if show_scatter:
        f_scatter = _scatter_plots(
            cp, signal, data_chans, signal_chans, signal_groups,
            scatter_cols=scatter_cols,
            scatter_colors=scatter_colors,
            scatter_symbols=scatter_symbols,
        )

    return ax.get_figure(), f_scatter