# Source code for pyproteome.motifs.neighborhood


from matplotlib import pyplot as plt
from scipy import stats

from . import motif, plogo


def enriched_neighborhood(
    data,
    f,
    residues,
    nmer_length=7,
    count_cutoff=2,
    mods=None,
):
    '''
    Calculates the hypergeometric enrichment value for the number of
    adjacent residues within a given window around all modification sites
    in a data set.

    The background is built from every sequence in `data`; the foreground
    is built from the sequences remaining after `data.filter(f)`. For each
    n-mer, the occurrences of every entry of `residues` are summed, and an
    n-mer is a "hit" when that sum is >= `count_cutoff`. The distributions
    of these sums are also drawn as two overlaid, normalized histograms.

    Parameters
    ----------
    data : :class:`pyproteome.data_sets.data_set.DataSet`
        Data set providing the ``'Sequence'`` column and ``.filter()``.
    f : dict or list of dict
        Filter passed to ``data.filter()`` to select the foreground; also
        used (via ``plogo.format_title``) to label the foreground histogram.
    residues : list of str
        Residues to count within each n-mer.
    nmer_length : int, optional
        Passed as ``n`` to ``motif.generate_n_mers``.
    count_cutoff : int, optional
        Minimum residue count (inclusive) for an n-mer to count as a hit.
    mods : optional
        Modification selector passed through to ``motif.generate_n_mers``.
        Defaults to ``[(None, 'Phospho')]`` when not given.

    Returns
    -------
    fig : :class:`matplotlib.figure.Figure`
    ax : :class:`matplotlib.axes.Axes`
    pval : float
        P-value, calculated with :class:`scipy.stats.hypergeom`.
    K : int
        Number of sequences with # residues >= count_cutoff in background
        list.
    N : int
        Size of the background list of sequences.
    k : int
        Number of sequences with # residues >= count_cutoff in foreground
        list.
    n : int
        Size of the foreground list of sequences.
    '''
    if mods is None:
        mods = [(None, 'Phospho')]

    background = motif.generate_n_mers(
        data['Sequence'],
        mods=mods,
        n=nmer_length,
        all_matches=False,
    )
    foreground = motif.generate_n_mers(
        data.filter(f)['Sequence'],
        mods=mods,
        n=nmer_length,
        all_matches=False,
    )

    # Count the residues of interest once per n-mer; the same numbers feed
    # both the enrichment statistics and the histograms below.
    background_counts = [
        sum(nmer.count(res) for res in residues)
        for nmer in background
    ]
    foreground_counts = [
        sum(nmer.count(res) for res in residues)
        for nmer in foreground
    ]

    N = len(background)
    K = sum(1 for count in background_counts if count >= count_cutoff)
    n = len(foreground)
    k = sum(1 for count in foreground_counts if count >= count_cutoff)

    # scipy's hypergeom takes (population size, successes in population,
    # number of draws); sf(k - 1) is P(X >= k).
    pval = stats.hypergeom(
        N,
        K,
        n,
    ).sf(
        min(k, n) - 1
    )

    fig, ax = plt.subplots(figsize=(4, 4))

    # One unit-wide bin per integer count. The extra right edge is required
    # because the final histogram bin is closed on both sides: without it
    # the two largest counts would be merged and anything above the last
    # edge would be silently dropped. Both histograms share the same edges
    # so they stay directly comparable.
    max_count = max([nmer_length] + background_counts + foreground_counts)
    bins = range(0, max_count + 2)

    if background_counts:
        ax.hist(
            background_counts,
            density=True,
            alpha=0.5,
            color='green',
            bins=bins,
            label='background',
        )

    if foreground_counts:
        ax.hist(
            foreground_counts,
            density=True,
            alpha=0.7,
            color='orange',
            bins=bins,
            label=plogo.format_title(f=f),
        )
        ax.legend()

    ax.set_ylabel('Frequency')

    return fig, ax, pval, K, N, k, n