Source code for pyproteome.analysis.protein

from matplotlib.patches import Circle, Ellipse, Rectangle, FancyBboxPatch
import numpy as np
from matplotlib import pyplot as plt
from collections import Counter


def _get_protein_seq(slc, gene):
    for _, row in slc.iterrows():
        for protein in row['Proteins'].proteins:
            if protein.gene == gene:
                return protein.full_sequence
            

def _get_lc(
    fc, 
    p, 
    edge=False, 
    p_cutoff=1e-2,
    upper_fc_cutoff=1.05,
    lower_fc_cutoff=0.95,
):
    if p < p_cutoff:
        if fc > upper_fc_cutoff:
            return '#ff0000' if edge else '#ff8888'
        elif fc < lower_fc_cutoff:
            return '#0000ff' if edge else '#8888ff'
    return '#000000' if edge else '#888888'


def _draw_line(
    ax, 
    row, 
    col, 
    col2, 
    larrow=True, 
    rarrow=True, 
    line_color='#000000',
    row_offset=-.5,
    col_offset=-.5,
    z=0,
):
    return ax.add_artist(
        FancyBboxPatch(
            (col + col_offset, row - .5),
            col2 - col, 
            .9,
            ec=None,
            fc=line_color,
            zorder=-1,
            boxstyle="round,pad=.05",
        )
    )


[docs]def draw_protein_seq( ds, genes, max_col=50, p_cutoff=1e-2, upper_fc_cutoff=1.05, lower_fc_cutoff=0.95, missed_cleavage=1, ): ''' Generate a figure showing all peptides in a data set mapping to the full sequence of their respective proteins. Peptide differential regulation is indicated by bars for full peptide sequences and circles indicating phosphorylated residues. Bars and circles are colored red for upregulation and blue for downregulation. Dark grey bars indicate an unmodified peptide with no change. Light grey bars indicate that only the phosphorylated version of that peptide was identified. Parameters ---------- ds : :class:`pyproteome.data_sets.data_set.DataSet` genes : list of str max_col : int, optional p_cutoff : float, optional upper_fc_cutoff : float, optional lower_fc_cutoff : float, optional missed_cleavage : int, optional Returns ------- figs : list of :class:`matplotlib.figure.Figure` Examples -------- >>> figs = analysis.protein.draw_protein_seq( ... ds, ['Mapt'] ... ) ''' ds = ds.filter(protein=genes) ds = ds.filter(missed_cleavage=missed_cleavage) max_col = 50 print(genes) # mod_types = sorted(set([ # mod.mod_type # for seq in ds['Sequence'] # for mod in seq.modifications # if 'TMT' not in mod.mod_type # ])) # colors = sns.color_palette( # "hls", len(mod_types), # ).as_hex() mod_types = ['Phospho'] colors = ['#db5f57'] figs = [] for gene in genes: slc = ds[ds['Proteins'] == gene] try: slc['Sort'] = slc.apply( lambda x: ( len(x['Modifications'].get_mods(mod_types)) > 0, len(x['Sequence']), x['p-value'], ), axis=1, ) slc = slc.sort_values('Sort', ascending=False) except: continue # display(slc[['Proteins', 'Sequence', 'Fold Change', 'p-value']]) prot_seq = _get_protein_seq(slc, gene) row_max = int(np.ceil(len(prot_seq) / max_col)) fig_x, fig_y = row_max / 4, max_col / 5 f, ax = plt.subplots(figsize=(fig_y, fig_x), dpi=300) figs.append(f) ax.set_title(gene) ax.set_xlim(left=-1, right=max_col) ax.set_ylim(bottom=row_max - .5, top=-1) ax.get_xaxis().set_visible(False) # ax.get_yaxis().set_visible(False) ax.set_yticks(list(range(row_max))) ax.set_yticklabels([i * max_col + 1 for i in ax.get_yticks()]) row, col = 0, 0 for ind, letter in enumerate(prot_seq): ax.text( x=col, y=row, s=letter, ha='center', va='center', family='monospace', zorder=20, ) col += 1 if col >= max_col: col = 0 row += 1 count = Counter() for _, pep in slc.iterrows(): seq = pep['Sequence'] match = [i for i in seq.protein_matches if i.protein.gene == gene][0] fc_lc = _get_lc( pep['Fold Change'], pep['p-value'], p_cutoff=p_cutoff, upper_fc_cutoff=upper_fc_cutoff, lower_fc_cutoff=lower_fc_cutoff, ) ec_lc = _get_lc( pep['Fold Change'], pep['p-value'], edge=True, p_cutoff=p_cutoff, upper_fc_cutoff=upper_fc_cutoff, lower_fc_cutoff=lower_fc_cutoff, ) mods = seq.modifications.get_mods(mod_types) # print(seq.pep_seq, match.rel_pos, match.exact, str(seq)) row = match.rel_pos // max_col col = match.rel_pos % max_col col2 = (col + len(seq.pep_seq)) count = Counter(seq.pep_seq) ro = -.4 - .1 * count[row, col] ro = -.4 - .1 * (count['K'] + count['R']) # Circle phosphosites for mod in mods: letter = mod.letter ind = mod.rel_pos + match.rel_pos mod_row = ind // max_col mod_col = ind % max_col ax.text( x=mod_col, y=mod_row, s=letter, ha='center', va='center', color='k', family='monospace', zorder=20, ) if ec_lc not in ['#888888', '#000000']: print(seq.pep_seq, mod, ind + 1) ax.add_artist( Ellipse( (mod_col, mod_row - .04), 1, 1, ec=ec_lc, fc='#ffffff', lw=1.5, zorder={ '#888888': 11, '#000000': 11, }.get(ec_lc, 12), ), ) if len(mods) > 0: fc_lc = '#cccccc' count[row, col] += 1 # Draw peptide lines rarrow = True # print(seq, fc_lc) while col2 > 0: _draw_line( ax, row, col, min([col2, max_col]), rarrow=rarrow, larrow=col2 < max_col, line_color=fc_lc, z={ '#cccccc': 1, '#888888': 2, }.get(fc_lc, 3), row_offset=ro, ) row += 1 col = 0 col2 -= max_col rarrow = False return figs