Source code for pyproteome.motifs.neighborhood


from matplotlib import pyplot as plt
from scipy import stats

from . import motif, plogo


def _count_residues(n_mer, residues):
    '''
    Return the total number of occurrences of any of `residues` in `n_mer`.
    '''
    return sum(n_mer.count(residue) for residue in residues)


def enriched_neighborhood(
    data,
    f,
    residues,
    nmer_length=7,
    count_cutoff=2,
    mods=None,
):
    '''
    Calculates the hypergeometric enrichment value for the number of adjacent
    residues within a given window around all modification sites in a data
    set.

    The background is the set of n-mers generated from every sequence in
    `data`; the foreground is the set of n-mers generated from
    `data.filter(f)`. A sequence is counted as a "hit" when the total number
    of occurrences of `residues` in its n-mer is greater than or equal to
    `count_cutoff`. Histograms of the per-n-mer residue counts are drawn for
    both sets on a single axis.

    Parameters
    ----------
    data : :class:`pyproteome.data_sets.data_set.DataSet`
    f : dict or list of dict
        Filter passed to `data.filter` to select the foreground.
    residues : list of str
        Residues to count within each n-mer.
    nmer_length : int, optional
        Length of the n-mers, passed to `motif.generate_n_mers` as `n`.
    count_cutoff : int, optional
        Minimum residue count (inclusive) for an n-mer to be counted in
        `K` / `k`.
    mods : str or list of str
        Modifications passed to `motif.generate_n_mers`. Defaults to
        ``[(None, 'Phospho')]``.

    Returns
    -------
    fig : :class:`matplotlib.figure.Figure`
    ax : :class:`matplotlib.axes.Axes`
    pval : float
        P-value, calculated with :class:`scipy.stats.hypergeom` as the
        probability of observing `k` or more hits in `n` draws.
    K : int
        Number of sequences with # residues >= count_cutoff in background
        list.
    N : int
        Size of the background list of sequences.
    k : int
        Number of sequences with # residues >= count_cutoff in foreground
        list.
    n : int
        Size of the foreground list of sequences.
    '''
    if mods is None:
        mods = [(None, 'Phospho')]

    background = motif.generate_n_mers(
        data['Sequence'],
        mods=mods,
        n=nmer_length,
        all_matches=False,
    )
    foreground = motif.generate_n_mers(
        data.filter(f)['Sequence'],
        mods=mods,
        n=nmer_length,
        all_matches=False,
    )

    # Count residues once per n-mer; the counts feed both the enrichment
    # statistics and the histograms.
    background_counts = [_count_residues(i, residues) for i in background]
    foreground_counts = [_count_residues(i, residues) for i in foreground]

    N = len(background)
    K = sum(1 for count in background_counts if count >= count_cutoff)
    n = len(foreground)
    k = sum(1 for count in foreground_counts if count >= count_cutoff)

    # scipy's hypergeom takes (population size, successes in population,
    # number of draws). sf(k - 1) is P(X >= k). k can never exceed n because
    # it counts a subset of the foreground.
    pval = stats.hypergeom(
        N,
        K,
        n,
    ).sf(
        k - 1
    )

    fig, ax = plt.subplots(figsize=(4, 4))

    # One unit-width bin per possible count 0..nmer_length. The edges run to
    # nmer_length + 1 so that the largest counts get their own bins instead
    # of being merged into the last bin or dropped.
    bins = range(0, nmer_length + 2, 1)

    if background:
        ax.hist(
            background_counts,
            density=True,
            alpha=0.5,
            color='green',
            bins=bins,
            label='background',
        )

    if foreground:
        ax.hist(
            foreground_counts,
            density=True,
            alpha=0.7,
            color='orange',
            bins=bins,
            label=plogo.format_title(f=f),
        )

    ax.legend()
    ax.set_ylabel('Frequency')

    return fig, ax, pval, K, N, k, n