Source code for pyproteome.data_sets.data_set

'''
This module provides functionality for manipulating proteomics data sets.

Functionality includes merging data sets and interfacing with attributes in a
structured format.
'''

# Built-ins
from __future__ import absolute_import, division

from collections import OrderedDict
import copy
import logging
import os
import re
import warnings
from itertools import chain
from functools import partial

# Core data analysis libraries
import pandas as pd
import pingouin as pg
import numpy as np
import numpy.ma as ma
from scipy.stats import ttest_ind, pearsonr, spearmanr, variation

from . import modification, protein, sequence, constand

import pyproteome as pyp


LOGGER = logging.getLogger('pyproteome.data_sets')

#:
DEFAULT_FILTER_BAD = dict(
    ion_score=15,
    isolation=30,
    median_quant=1.5e3,
    q=1e-2,
)
'''
Default parameters for filtering data sets.

Selects all ions with an ion score >= 15, isolation interference <= 30,
median quantification signal >= 1.5e3, and an optional false-discovery
q-value <= 0.01.
'''
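
# Illustrative sketch (not part of the original module): the defaults above
# can be overridden by passing a dict as the ``filter_bad`` argument to
# DataSet, or applied later through DataSet.filter(). The search file name
# 'CK-H1-pY' is hypothetical.
def _example_custom_filter():
    ds = DataSet(
        name='CK-H1-pY',
        filter_bad=dict(
            ion_score=20,       # stricter MASCOT ion score cutoff
            isolation=25,       # isolation interference <= 25%
            median_quant=1e3,   # median channel signal >= 1e3
        ),
    )

    # Equivalent post-hoc filtering on an already-loaded data set:
    ds = ds.filter(dict(ion_score=20, isolation=25, median_quant=1e3))

    return ds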

DATA_SET_COLS = [
    'Proteins',
    'Sequence',
    'Modifications',
    'Validated',
    'Confidence Level',
    'Ion Score',
    'q-value',
    'Isolation Interference',
    'Missed Cleavages',
    'Ambiguous',
    'Charges',
    'Masses',
    'RTs',
    'Intensities',
    'Raw Paths',
    'Scan Paths',
    'Scan',
    'Fold Change',
    'p-value',
]
'''
Columns available in DataSet.psms.

Note that this does not include columns for quantification or weights.
'''


class DataSet:
    '''
    Class that encompasses a proteomics data set.

    Data sets can be initialized by calling this class's constructor
    directly, or using :func:`.load_all_data`.

    Includes peptide-spectrum matches, quantification info, and mappings
    between channels, samples, and sample groups.

    Data sets are automatically loaded, filtered, and merged by default. See
    :const:`.DEFAULT_FILTER_BAD` for default filtering parameters. See
    :func:`.DataSet.merge_duplicates` for info on how multiple
    peptide-spectrum matches are integrated together.

    Attributes
    ----------
    search_name : str, optional
        Name of the search file this data set was loaded from.
    psms : :class:`pandas.DataFrame`
        Contains at least 'Proteins', 'Sequence', and 'Modifications'
        columns as well as any quantification data.
    channels : dict of str, str
        Maps label channel to sample name.
    groups : dict of str, list of str
        Maps groups to list of sample names. The primary group is considered
        as the first in this sequence.
    cmp_groups : list of list of str
        List of groups that are being compared.
    name : str
        Name of this data set.
    levels : dict of str, float
        Peptide levels used for normalization.
    intra_normalized : bool
        Indicates if the data set has been normalized within a TMT-plex
        analysis.
    inter_normalized : bool
        Indicates if the data set has been normalized for comparison across
        TMT-plex analyses.
    sets : int
        Number of sets merged into this data set.
    '''

    def __init__(
        self,
        name='',
        psms=None,
        search_name=None,
        channels=None,
        groups=None,
        cmp_groups=None,
        fix_channel_names=True,
        dropna=False,
        pick_best_psm=True,
        constand_norm=False,
        merge_duplicates=True,
        filter_bad=True,
        check_raw=True,
        skip_load=False,
        skip_logging=False,
    ):
        '''
        Initializes a data set.

        Parameters
        ----------
        name : str, optional
            Name of the data set, by default this is also the .msf file that
            is used to load peptide-spectrum matches.
        psms : :class:`pandas.DataFrame`, optional
            Prebuilt dataframe of peptide-spectrum matches.
        search_name : str, optional
            Read psms from MASCOT / Discoverer data files.
        channels : dict of (str, str), optional
            Ordered dictionary mapping sample names to quantification
            channels (i.e. {'X': '126', 'Y': '127', 'Z': '128', 'W': '129'})
        groups : dict of str, list of str, optional
            Ordered dictionary mapping sample names to larger groups (i.e.
            {'WT': ['X', 'Y'], 'Diseased': ['W', 'Z']})
        cmp_groups : list of list of str, optional
            cmp_groups that are passed to :func:`.DataSet.norm_cmp_groups`.
        fix_channel_names : bool, optional
            Rename quantification channels to match the names found in the
            search file (i.e. 130_C <=> 130C).
        dropna : bool, optional
            Drop scans that have any channels with missing quantification
            values.
        pick_best_psm : bool, optional
            Select the peptide sequence for each scan that has the highest
            MASCOT ion score. (i.e. {'pSTY': 5, 'SpTY': 10, 'STpY': 20} =>
            'STpY')
        constand_norm : bool, optional
            Normalize channel intensities using CONSTANd.
        merge_duplicates : bool, optional
            Merge scans that have the same peptide sequence into one
            peptide, summing the quantification channel intensities to give
            a weighted estimate of relative abundances.
        filter_bad : bool or dict, optional
            Remove peptides that do not have a 'High' confidence score from
            ProteomeDiscoverer.
        check_raw : bool, optional
            Check that all raw files referenced in the search data can be
            found in :const:`pyproteome.paths.MS_RAW_DIR`.
        skip_load : bool, optional
            Just initialize the structure, don't load any data.
        skip_logging : bool, optional
            Don't log any information.
        '''
        if search_name is None:
            search_name = name

        if search_name and os.path.splitext(search_name)[1] == '':
            search_name += '.msf'

        self.channels = channels.copy() if channels else OrderedDict()
        self.groups = groups.copy() if groups else OrderedDict()
        self.cmp_groups = cmp_groups or None
        self.group_a, self.group_b = None, None

        species, lst = set(), []

        if search_name and not skip_load:
            self.psms, species, lst = pyp.loading.load_psms(
                search_name,
                pick_best_psm=pick_best_psm,
            )
            for col in DATA_SET_COLS:
                assert col in self.psms.columns
        else:
            self.psms = pd.DataFrame(
                columns=DATA_SET_COLS + list(self.channels.values()),
            )

        if psms is not None:
            self.add_peptide(psms)

        self.name = name
        self.levels = None
        self.search_name = search_name
        self.species = species

        self.intra_normalized = False
        self.inter_normalized = False
        self.sets = 1

        if fix_channel_names:
            self.fix_channel_names()

        if check_raw:
            self.check_raw()

        if dropna:
            LOGGER.info(
                '{}: Dropping channels with NaN values.'.format(self.name)
            )
            self.dropna(inplace=True)

        if filter_bad is True:
            filter_bad = DEFAULT_FILTER_BAD.copy()

            if pd.isnull(self.psms['q-value']).all() and 'q' in filter_bad:
                del filter_bad['q']

            if pd.isnull(self.psms[list(self.channels.values())]).all().all():
                del filter_bad['median_quant']

        if not skip_logging:
            self.log_stats()

        if filter_bad:
            LOGGER.info(
                '{}: Filtering peptides using: {}'
                .format(self.name, filter_bad)
            )
            self.filter(
                filter_bad,
                inplace=True,
            )
            self.log_stats()

        if pick_best_psm and (
            not search_name or
            os.path.splitext(search_name)[1] != '.msf' or
            any(i for i in lst)
        ):
            LOGGER.info(
                '{}: Picking peptides with best ion score for each scan.'
                .format(self.name)
            )
            self._pick_best_psm()

        if constand_norm:
            channels = list(self.channels.values())

            # Display quant distribution
            pyp.levels.get_channel_levels(self)

            # Save channel intensities as weights for data integration
            for channel in channels:
                weight = '{}_weight'.format(channel)
                self.psms[weight] = self.psms[channel]

            # Run CONSTANd normalization
            constand.constand(self, inplace=True)

        if merge_duplicates:
            LOGGER.info(
                '{}: Merging duplicate peptide hits together.'
                .format(self.name)
            )
            self.merge_duplicates(inplace=True)

        if cmp_groups:
            self.norm_cmp_groups(cmp_groups, inplace=True)

        self.update_group_changes()

        if not skip_logging:
            self.log_stats()

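    # Illustrative sketch (not part of the original class): a typical
    # construction call; the file, sample, and group names here are
    # hypothetical::
    #
    #     channels = OrderedDict([
    #         ('WT #1', '126'), ('WT #2', '127'),
    #         ('KO #1', '128'), ('KO #2', '129'),
    #     ])
    #     groups = OrderedDict([
    #         ('WT', ['WT #1', 'WT #2']),
    #         ('KO', ['KO #1', 'KO #2']),
    #     ])
    #     ds = DataSet(name='KO-pY', channels=channels, groups=groups)
    #
    # With the defaults above, this loads 'KO-pY.msf', applies
    # DEFAULT_FILTER_BAD, picks the best PSM per scan, and merges duplicate
    # peptide hits.
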
    def copy(self):
        '''
        Make a copy of self.

        Returns
        -------
        ds : :class:`.DataSet`
        '''
        new = copy.copy(self)

        new.psms = new.psms.copy()
        new.channels = new.channels.copy()
        new.groups = new.groups.copy()
        new.species = new.species.copy()

        return new

    def fix_channel_names(self):
        '''
        Correct quantification channel names to those present in the search
        file.

        i.e. from 130_C to 130C (or vice versa).
        '''
        for key, val in list(self.channels.items()):
            if '_' in val:
                new_val = val.replace('_', '')
            else:
                new_val = val[:-1] + '_' + val[-1]

            if (
                val not in self.psms.columns and
                new_val in self.psms.columns
            ):
                LOGGER.info(
                    '{}: Correcting {}: {} to {}'.format(
                        self.name, key, val, new_val,
                    )
                )
                self.channels[key] = new_val

    @property
    def samples(self):
        '''
        Get a list of sample names in this data set.

        Returns
        -------
        list of str
        '''
        return self.get_samples()

    def get_samples(self, groups=None):
        '''
        Get a list of sample names in this data set.

        Parameters
        ----------
        groups : list of (list of str), optional

        Returns
        -------
        list of str
        '''
        if groups is None:
            groups = self.cmp_groups or [list(self.groups.keys())]

        channel_names = [
            sample_name
            for lst in groups
            for group in lst
            for sample_name in self.groups[group]
            if sample_name in self.channels
        ]

        # Remove duplicates while preserving order
        channel_names = list(OrderedDict.fromkeys(channel_names))

        return channel_names

    def __str__(self):
        return (
            '<pyproteome.DataSet object' +
            (
                ': ' + self.name
                if self.name else
                ' at ' + hex(id(self))
            ) +
            '>'
        )

    def __getitem__(self, key):
        if isinstance(key, slice):
            new = self.copy()
            new.psms = new.psms[key]
            return new

        if any(
            isinstance(key, i)
            for i in [str, list, set, tuple, pd.Series, np.ndarray]
        ):
            return self.psms[key]

        raise TypeError(type(key))

    def __setitem__(self, key, val):
        if any(
            isinstance(key, i)
            for i in [str, list, set, tuple, pd.Series, np.ndarray]
        ):
            self.psms[key] = val
        else:
            raise TypeError(type(key))

    @property
    def shape(self):
        '''
        Get the size of a data set in (rows, columns) format.

        Returns
        -------
        shape : tuple of (int, int)
        '''
        return self.psms.shape

    def _pick_best_psm(self):
        reject_mask = np.zeros(self.shape[0], dtype=bool)

        for index, row in self.psms.iterrows():
            hits = np.logical_and(
                self.psms['Scan'] == row['Scan'],
                self.psms['Sequence'] != row['Sequence'],
            )

            if 'Rank' in self.psms.columns:
                better = self.psms['Rank'] < row['Rank']
            else:
                better = self.psms['Ion Score'] > row['Ion Score']

            hits = np.logical_and(hits, better)

            if hits.any():
                reject_mask[index] = True

        self.psms = self.psms[~reject_mask].reset_index(drop=True)

    def merge_duplicates(self, inplace=False):
        '''
        Merge together all duplicate peptides. New quantification values are
        calculated from a weighted sum of each channel's values.

        Parameters
        ----------
        inplace : bool, optional
            Modify the data set in place, otherwise create a copy and return
            the new object.

        Returns
        -------
        ds : :class:`.DataSet`
        '''
        new = self

        if not inplace:
            new = new.copy()

        if new.shape[0] < 1:
            return new

        channels = list(new.channels.values())
        agg_dict = OrderedDict()

        for channel in channels:
            weight = '{}_weight'.format(channel)
            std = '{}_std'.format(channel)
            mean = '{}_mean'.format(channel)
            cv = '{}_cv'.format(channel)

            if weight in new.psms.columns:
                new.psms[channel] *= new.psms[weight]
                agg_dict[weight] = 'sum'

            agg_dict[channel] = 'sum'

            if cv not in new.psms.columns:
                new.psms[std] = new.psms[channel]
                new.psms[mean] = new.psms[channel]

                agg_dict[std] = 'std'
                agg_dict[mean] = 'mean'
            else:
                agg_dict[cv] = 'mean'

        agg_dict['Modifications'] = _first
        agg_dict['Missed Cleavages'] = _first
        agg_dict['Validated'] = all
        agg_dict['Scan Paths'] = pyp.utils.flatten_set
        agg_dict['Raw Paths'] = pyp.utils.flatten_set
        agg_dict['Ambiguous'] = all
        agg_dict['Masses'] = pyp.utils.flatten_set
        agg_dict['Charges'] = pyp.utils.flatten_set
        agg_dict['Intensities'] = pyp.utils.flatten_set
        agg_dict['RTs'] = pyp.utils.flatten_set
        agg_dict['Scan'] = pyp.utils.flatten_set
        agg_dict['Ion Score'] = max
        agg_dict['q-value'] = min
        agg_dict['Confidence Level'] = partial(
            max,
            key=lambda x: ['Low', 'Medium', 'High'].index(x),
        )
        agg_dict['Isolation Interference'] = min

        new.psms = new.psms.groupby(
            by=[
                'Proteins',
                'Sequence',
            ],
            sort=False,
            as_index=False,
        ).agg(agg_dict).reset_index(drop=True)

        for channel in channels:
            weight = '{}_weight'.format(channel)
            std = '{}_std'.format(channel)
            mean = '{}_mean'.format(channel)
            cv = '{}_cv'.format(channel)

            if cv not in new.psms.columns:
                new.psms[cv] = new.psms[std] / new.psms[mean]

                del new.psms[std]
                del new.psms[mean]

            if weight in new.psms.columns:
                new.psms[channel] = (
                    new.psms[channel] / new.psms[weight]
                )

            new.psms[channel] = new.psms[channel].replace(0, np.nan)

        return new

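    # Worked illustration (not part of the original class) of the weighted
    # merge above: two PSMs of the same peptide with channel values v and
    # weights w are combined as sum(v * w) / sum(w). For example, with
    # v = [10, 20] and w = [1, 3] (weights come from channel intensity and
    # isolation interference, as set in inter_normalize), the merged value
    # is (10 * 1 + 20 * 3) / (1 + 3) = 17.5, so the higher-quality scan
    # dominates. When no weight columns exist, channel values are simply
    # summed, and per-channel CVs are computed from the grouped std / mean.
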
    def merge_subsequences(self, inplace=False):
        '''
        Merges peptides that are a subsequence of another peptide. (i.e.
        SVYTEIKIHK + SVYTEIK -> SVYTEIK)

        Only merges peptides that contain the same set of modifications and
        that map to the same protein(s).

        Parameters
        ----------
        inplace : bool, optional

        Returns
        -------
        ds : :class:`.DataSet`
        '''
        new = self

        if not inplace:
            new = new.copy()

        new['__sort__'] = new['Sequence'].apply(len)
        new.psms = new.psms.sort_values('__sort__', ascending=True)

        # Find all proteins that have more than one peptide mapping to them
        for index, row in new.psms[
            new.psms.duplicated(subset='Proteins', keep=False)
        ].iterrows():
            seq = row['Sequence']

            # Then iterate over each peptide and find other non-identical
            # peptides that map to the same protein
            for o_index, o_row in new.psms[
                np.logical_and(
                    new.psms['Proteins'] == row['Proteins'],
                    new.psms['Sequence'] != seq,
                )
            ].iterrows():
                # If that other peptide is a subset of this peptide, rename
                # it
                if o_row['Sequence'] in seq:
                    cols = [
                        'Sequence',
                        'Modifications',
                        'Missed Cleavages',
                    ]
                    new.psms.at[o_index, cols] = row[cols]

        del new.psms['__sort__']

        # And finally group together peptides that were renamed
        return new.merge_duplicates(inplace=inplace)

    def __add__(self, other):
        '''
        Concatenate two data sets.

        Combines two data sets, adding together the channel values for any
        common data.

        Parameters
        ----------
        other : :class:`.DataSet`

        Returns
        -------
        ds : :class:`.DataSet`
        '''
        return merge_data([self, other])

    def rename_channels(self, inplace=False):
        '''
        Rename all column names for quantification channels to sample names.
        (i.e. '126' => 'Mouse #1')

        Parameters
        ----------
        inplace : bool, optional

        Returns
        -------
        ds : :class:`.DataSet`
        '''
        new = self

        if not inplace:
            new = new.copy()

        for new_channel, old_channel in new.channels.items():
            if new_channel != old_channel:
                new_weight = '{}_weight'.format(new_channel)

                if (
                    new_channel in new.psms.columns or
                    new_weight in new.psms.columns
                ):
                    raise Exception(
                        'Channel {} already exists, cannot rename to it'
                        .format(new_channel)
                    )

                new.psms[new_channel] = new.psms[old_channel]
                del new.psms[old_channel]

                old_weight = '{}_weight'.format(old_channel)

                if old_weight in new.psms.columns:
                    new.psms[new_weight] = new.psms[old_weight]
                    del new.psms[old_weight]

        new.channels = OrderedDict([
            (key, key) for key in new.channels.keys()
        ])

        return new

    def inter_normalize(
        self,
        norm_channels=None,
        other=None,
        inplace=False,
    ):
        '''
        Normalize runs to one channel for inter-run comparisons.

        Parameters
        ----------
        other : :class:`.DataSet`, optional
            Second data set to normalize quantification values against,
            using a common set of normalization channels.
        norm_channels : list of str, optional
            Normalization channels to use for cross-run normalization.
        inplace : bool, optional
            Modify this data set in place.

        Returns
        -------
        ds : :class:`.DataSet`
        '''
        assert (
            norm_channels is not None or
            other is not None
        )

        new = self

        if not inplace:
            new = new.copy()

        new = new.rename_channels(inplace=inplace)

        if norm_channels is None:
            norm_channels = set(new.channels).intersection(other.channels)

            # If there are no common channels, do not change anything
            if len(norm_channels) == 0:
                return new

        # Filter norm channels to include only those in other data set
        norm_channels = [
            chan
            for chan in norm_channels
            if not other or chan in other.channels
        ]

        for channel in new.channels.values():
            weight = '{}_weight'.format(channel)

            if weight not in new.psms.columns:
                new.psms[weight] = (
                    new.psms[channel] *
                    (100 - new.psms['Isolation Interference']) / 100
                )

        # Calculate the mean normalization signal from each shared channel
        new_mean = new.psms[norm_channels].mean(axis=1)

        # Drop values for which there is no normalization data
        new.psms = new.psms[~new_mean.isnull()].reset_index(drop=True)

        if other:
            merge = pd.merge(
                new.psms,
                other.psms,
                on=[
                    # 'ProteinStr',
                    # 'PeptideStr',
                    'Proteins',
                    'Sequence',
                    'Modifications',
                ],
                how='left',
                suffixes=('_self', '_other'),
            )
            assert merge.shape[0] == new.shape[0]

            self_mean = merge[
                ['{}_self'.format(i) for i in norm_channels]
            ].mean(axis=1)

            other_mean = merge[
                ['{}_other'.format(i) for i in norm_channels]
            ].mean(axis=1)

            for channel in new.channels.values():
                vals = merge[
                    channel
                    if channel in merge.columns else
                    '{}_self'.format(channel)
                ]

                # Set scaling factor to 1 where other_mean is None
                cp = other_mean.copy()
                cp[other_mean.isnull()] = (
                    self_mean[other_mean.isnull()]
                )

                if self_mean.any():
                    vals *= cp / self_mean

                assert new.shape[0] == vals.shape[0]

                new.psms[channel] = vals

        new.update_group_changes()
        new.inter_normalized = True

        return new

    def normalize(self, lvls, inplace=False):
        '''
        Normalize channels to given levels for intra-run comparisons.

        Divides all channel values by a given level.

        Parameters
        ----------
        lvls : dict of str, float, or :class:`.DataSet`
            Mapping of channel names to normalized levels. All
            quantification values for each channel are divided by the
            normalization factor. A data set may also be given, in which
            case its channel levels are used.
        inplace : bool, optional
            Modify this data set in place.

        Returns
        -------
        ds : :class:`.DataSet`
        '''
        new = self

        # Don't normalize a data set twice!
        assert not new.intra_normalized

        if not inplace:
            new = new.copy()

        if hasattr(lvls, 'levels'):
            if not lvls.levels:
                lvls.levels = pyp.levels.get_channel_levels(lvls)[1]

            lvls = lvls.levels

        new_channels = pyp.utils.norm(self.channels)

        for key, norm_key in zip(
            self.channels.values(),
            new_channels.values(),
        ):
            new.psms[norm_key] = new.psms[key] / lvls[key]
            del new.psms[key]

        new.intra_normalized = True
        new.channels = new_channels
        new.groups = self.groups.copy()

        new.update_group_changes()

        return new

    def check_raw(self):
        '''
        Checks that all raw files referenced in search data can be found in
        :const:`pyproteome.paths.MS_RAW_DIR`.

        Returns
        -------
        found_all : bool
        '''
        try:
            raw_dir = [
                i.lower()
                for i in os.listdir(pyp.paths.MS_RAW_DIR)
            ]
        except OSError:
            raw_dir = []

        found_all = True

        for raw in sorted(
            pyp.utils.flatten_set(
                row['Raw Paths']
                for _, row in self.psms.iterrows()
            )
        ):
            if raw.lower() not in raw_dir:
                LOGGER.warning(
                    '{}: Unable to locate raw file for {}'
                    .format(self.name, raw)
                )
                found_all = False

        return found_all

    def dropna(
        self,
        columns=None,
        how=None,
        thresh=None,
        groups=None,
        inplace=False,
    ):
        '''
        Drop any channels with NaN values.

        Parameters
        ----------
        columns : list of str, optional
        how : str, optional
        thresh : int, optional
        groups : list of str, optional
            Only drop rows with NaN in columns within groups.
        inplace : bool, optional

        Returns
        -------
        ds : :class:`.DataSet`
        '''
        new = self

        if not inplace:
            new = new.copy()

        if columns is None:
            columns = list(new.channels.values())

        if how is None and thresh is None:
            how = 'any'

        if groups is not None:
            columns = [
                new.channels[col]
                for group in groups
                for col in new.groups[group]
                if col in new.channels
            ]

        new.psms = new.psms.dropna(
            axis=0,
            how=how,
            thresh=thresh,
            subset=columns,
        ).reset_index(drop=True)

        return new

    def add_peptide(self, inserts):
        '''
        Manually add a peptide or list of peptides to a data set.

        Parameters
        ----------
        inserts : dict or list of dict

        Examples
        --------
        This example demonstrates how to insert a peptide that was manually
        validated by the user::

            prots = data_sets.protein.Proteins(
                proteins=(
                    data_sets.protein.Protein(
                        accession='Q920G3',
                        gene='Siglec5',
                        description='Sialic acid-binding Ig-like lectin 5',
                        full_sequence=(
                            'MRWAWLLPLLWAGCLATDGYSLSVTGSVTVQEGLCVFVACQVQYPNSKGPVFGYWFREGA'
                            'NIFSGSPVATNDPQRSVLKEAQGRFYLMGKENSHNCSLDIRDAQKIDTGTYFFRLDGSVK'
                            'YSFQKSMLSVLVIALTEVPNIQVTSTLVSGNSTKLLCSVPWACEQGTPPIFSWMSSALTS'
                            'LGHRTTLSSELNLTPRPQDNGTNLTCQVNLPGTGVTVERTQQLSVIYAPQKMTIRVSWGD'
                            'DTGTKVLQSGASLQIQEGESLSLVCMADSNPPAVLSWERPTQKPFQLSTPAELQLPRAEL'
                            'EDQGKYICQAQNSQGAQTASVSLSIRSLLQLLGPSCSFEGQGLHCSCSSRAWPAPSLRWR'
                            'LGEGVLEGNSSNGSFTVKSSSAGQWANSSLILSMEFSSNHRLSCEAWSDNRVQRATILLV'
                            'SGPKVSQAGKSETSRGTVLGAIWGAGLMALLAVCLCLIFFTVKVLRKKSALKVAATKGNH'
                            'LAKNPASTINSASITSSNIALGYPIQGHLNEPGSQTQKEQPPLATVPDTQKDEPELHYAS'
                            'LSFQGPMPPKPQNTEAMKSVYTEIKIHKC'
                        ),
                    ),
                ),
            )

            seq = data_sets.sequence.extract_sequence(prots, 'SVyTEIK')

            mods = data_sets.modification.Modifications(
                mods=[
                    data_sets.modification.Modification(
                        rel_pos=0,
                        mod_type='TMT10plex',
                        nterm=True,
                        sequence=seq,
                    ),
                    data_sets.modification.Modification(
                        rel_pos=2,
                        mod_type='Phospho',
                        sequence=seq,
                    ),
                    data_sets.modification.Modification(
                        rel_pos=6,
                        mod_type='TMT10plex',
                        sequence=seq,
                    ),
                ],
            )

            seq.modifications = mods

            ckh_sigf1_py_insert = {
                'Proteins': prots,
                'Sequence': seq,
                'Modifications': mods,
                '126': 1.46e4,
                '127N': 2.18e4,
                '127C': 1.88e4,
                '128N': 4.66e3,
                '128C': 6.70e3,
                '129N': 7.88e3,
                '129C': 1.03e4,
                '130N': 7.28e3,
                '130C': 2.98e3,
                '131': 6.01e3,
                'Validated': True,
                'First Scan': {23074},
                'Raw Paths': {'2019-04-24-CKp25-SiglecF-1-py-SpinCol-col189.raw'},
                'Scan Paths': {'CK-7wk-H1-pY'},
                'IonScore': 30,
                'Isolation Interference': 0,
            }

            ds.add_peptide([ckh_sigf1_py_insert])
        '''
        defaults = {
            'Proteins': protein.Proteins(),
            'Sequence': sequence.Sequence(),
            'Modifications': modification.Modifications(),
            'Validated': False,
            'Confidence Level': 'High',
            'Ion Score': 100,
            'q-value': 0,
            'Isolation Interference': 0,
            'Missed Cleavages': 0,
            'Ambiguous': False,
            'Charges': set(),
            'Masses': set(),
            'RTs': set(),
            'Intensities': set(),
            'Raw Paths': set(),
            'Scan Paths': set(),
            'Scan': set(),
            'Fold Change': np.nan,
            'p-value': np.nan,
        }

        if isinstance(inserts, dict):
            inserts = [inserts]

        for insert in inserts:
            for key, val in defaults.items():
                if key not in insert:
                    insert[key] = val

            for col in DATA_SET_COLS:
                assert col in insert

            self.psms = self.psms.append(pd.Series(insert), ignore_index=True)

    def filter(
        self,
        filters=None,
        inplace=False,
        **kwargs
    ):
        '''
        Filters a data set.

        Parameters
        ----------
        filters : list of dict or dict, optional
            List of filters to apply to data set. Filters are also pulled
            from `kwargs` (see below).
        inplace : bool, optional
            Perform the filter on self, otherwise create a copy and return
            the new object.

        Notes
        -----
        These parameters filter your data set to only include peptides that
        match a given attribute. For example::

            >>> data.filter(mod='Y', p=0.01, fold=2)

        This function interprets both the filters argument and python
        `kwargs` magic. The three calls below are all equivalent::

            >>> data.filter(p=0.01)
            >>> data.filter([{'p': 0.01}])
            >>> data.filter({'p': 0.01})

        Filter parameters can be one of any below:

        ================ ===================================================
        Name             Description
        ================ ===================================================
        series           Use a pandas series (data.psms[series]).
        fn               Use data.psms.apply(fn).
        group_a          Calculate p / fold change values from group_a.
        group_b          Calculate p / fold change values from group_b.
        ambiguous        Include peptides with ambiguous PTMs if true,
                         filter them out if false.
        confidence       Discoverer's peptide confidence (High|Medium|Low).
        ion_score        MASCOT's ion score.
        isolation        Discoverer's isolation interference.
        missed_cleavage  Missed cleavages <= cutoff.
        median_quant     Median quantification signal >= cutoff.
        median_cv        Median coefficient of variation <= cutoff.
        p                p-value < cutoff.
        q                q-value < cutoff.
        asym_fold        Change > val if cutoff > 1 else Change < val.
        fold             Change > cutoff or Change < 1 / cutoff.
        motif            Filter for motif.
        protein          Filter for protein or list of proteins.
        accession        Filter for protein or list of UniProt accessions.
        sequence         Filter for sequence or list of sequences.
        mod              Filter for modifications.
        only_validated   Use rows validated by CAMV.
        scan_paths       Filter for any matching scan path.
        any              Use rows that match any filter.
        inverse          Use all rows that are rejected by a filter.
        rename           Change the new data set's name to a new value.
        ================ ===================================================

        Returns
        -------
        ds : :class:`.DataSet`
        '''
        new = self

        if filters is None:
            filters = []

        if filters and not isinstance(filters, (list, tuple)):
            filters = [filters]

        if kwargs:
            filters += [kwargs]

        if not inplace:
            new = new.copy()

        filters = copy.deepcopy(filters)

        confidence = {
            'High': ['High'],
            'Medium': ['Medium', 'High'],
            'Low': ['Low', 'Medium', 'High'],
        }

        fns = {
            'series': lambda val, psms: val,
            'fn': lambda val, psms: psms.apply(val, axis=1),
            'ambiguous': lambda val, psms: psms['Ambiguous'] == val,
            'confidence': lambda val, psms:
                psms['Confidence Level'].isin(confidence[val]),
            'ion_score': lambda val, psms: psms['Ion Score'] >= val,
            'isolation': lambda val, psms:
                psms['Isolation Interference'] <= val,
            'missed_cleavage': lambda val, psms:
                psms['Missed Cleavages'] <= val,
            'median_quant': lambda val, psms:
                np.nan_to_num(
                    np.nanmedian(
                        psms[[chan for chan in new.channels.values()]],
                        axis=1,
                    )
                ) >= val,
            'median_cv': lambda val, psms:
                np.nan_to_num(
                    np.nanmedian(
                        psms[[
                            # Lowercase '_cv' columns are generated by
                            # merge_duplicates()
                            '{}_cv'.format(chan)
                            for chan in new.channels.values()
                        ]],
                        axis=1,
                    )
                ) <= val,
            'p': lambda val, psms:
                (~psms['p-value'].isnull()) &
                (psms['p-value'] <= val),
            'q': lambda val, psms:
                (~psms['q-value'].isnull()) &
                (psms['q-value'] <= val),
            'asym_fold': lambda val, psms:
                (~psms['Fold Change'].isnull()) &
                (
                    psms['Fold Change'] >= val
                    if val > 1 else
                    psms['Fold Change'] <= val
                ),
            'fold': lambda val, psms:
                (~psms['Fold Change'].isnull()) &
                (
                    psms['Fold Change'].apply(
                        lambda x: x if x > 1 else (1 / x if x else x)
                    ) >= (val if val > 1 else 1 / val)
                ),
            'motif': lambda val, psms:
                psms['Sequence'].apply(
                    lambda x: any(
                        (
                            pyp.motifs.motif.Motif(val)
                            if isinstance(val, str) else
                            val
                        ).match(nmer)
                        for nmer in pyp.motifs.generate_n_mers(
                            x,
                            mods=f.get('mod', None),
                        )
                    )
                ),
            'protein': lambda val, psms:
                psms['Proteins'].apply(
                    lambda x: bool(set(val).intersection(x.genes))
                )
                if isinstance(val, (list, set, tuple, pd.Series)) else
                psms['Proteins'] == val.strip(),
            'accession': lambda val, psms:
                psms['Proteins'].apply(
                    lambda x: bool(set(val).intersection(x.accessions))
                )
                if isinstance(val, (list, set, tuple, pd.Series)) else
                psms['Proteins'].apply(
                    lambda x: any([i == val.strip() for i in x.accessions])
                ),
            'sequence': lambda val, psms:
                psms['Sequence'].apply(lambda x: any(i in x for i in val))
                if isinstance(val, (list, set, tuple, pd.Series)) else
                psms['Sequence'] == val.strip(),
            'mod': lambda val, psms:
                psms['Modifications'].apply(
                    lambda x: bool(list(x.get_mods(val).skip_labels()))
                ),
            'only_validated': lambda val, psms: psms['Validated'] == val,
            'scan_paths': lambda val, psms:
                psms['Scan Paths'].apply(
                    lambda x: any(i in val for i in x)
                ),
        }

        with warnings.catch_warnings():
            warnings.filterwarnings(
                'ignore',
                r'All-NaN (slice|axis) encountered',
            )

            for f in filters:
                group_a = f.pop('group_a', None)
                group_b = f.pop('group_b', None)

                if group_a or group_b:
                    new.update_group_changes(
                        group_a=group_a,
                        group_b=group_b,
                    )

                inverse = f.pop('inverse', False)

                rename = f.pop('rename', None)

                if rename is not None:
                    new.name = rename

                if f.pop('any', False):
                    if new.shape[0] < 1:
                        return new

                    mask = pd.Series(
                        [inverse] * new.shape[0],
                        index=new.psms.index,
                    )

                    for key, val in f.items():
                        f_mask = fns[key](val, new.psms)

                        if inverse:
                            f_mask = ~f_mask

                        mask |= f_mask

                    assert mask.shape[0] == new.shape[0]

                    new.psms = new.psms.loc[mask].reset_index(drop=True)
                else:
                    for key, val in f.items():
                        # Skip filtering if psms is empty
                        if new.shape[0] < 1:
                            continue

                        mask = fns[key](val, new.psms)

                        if inverse:
                            mask = ~mask

                        assert mask.shape[0] == new.shape[0]

                        new.psms = new.psms.loc[mask].reset_index(drop=True)

        new.psms.reset_index(inplace=True, drop=True)

        return new

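    # Illustrative sketch (not part of the original class): combining filter
    # parameters. Each dict in a filter list is applied sequentially, and
    # keys within one dict combine with AND unless ``any=True``::
    #
    #     ds = ds.filter(mod='Y', p=0.05)        # pY peptides with p <= 0.05
    #     ds = ds.filter([
    #         dict(any=True, fold=1.5, p=0.01),  # fold >= 1.5x OR p <= 0.01
    #     ])
    #     bg = ds.filter(mod='Y', inverse=True)  # everything except pY
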
    def get_groups(self, group_a=None, group_b=None):
        '''
        Get channels associated with two groups.

        Parameters
        ----------
        group_a : str or list of str, optional
        group_b : str or list of str, optional

        Returns
        -------
        samples : list of str
        labels : list of str
        groups : tuple of (str or list of str)
        '''
        groups = [
            val
            for key, val in self.groups.items()
            if any(chan in self.channels for chan in val)
        ]
        labels = [
            key
            for key, val in self.groups.items()
            if any(chan in self.channels for chan in val)
        ]

        group_a = group_a or self.group_a
        group_b = group_b or self.group_b

        if group_a is None:
            label_a = labels[0] if labels else None
            samples_a = groups[0] if groups else []
        elif isinstance(group_a, str):
            label_a = group_a
            samples_a = self.groups[group_a]
        else:
            group_a = [
                group
                for group in group_a
                if any(
                    sample in self.channels
                    for sample in self.groups[group]
                )
            ]
            label_a = ', '.join(group_a)
            samples_a = [
                sample
                for i in group_a
                for sample in self.groups[i]
                if sample in self.channels
            ]

        if group_b is None:
            label_b = labels[1] if labels[1:] else None
            samples_b = groups[1] if groups[1:] else []
        elif isinstance(group_b, str):
            label_b = group_b
            samples_b = self.groups[group_b]
        else:
            group_b = [
                group
                for group in group_b
                if any(
                    sample in self.channels
                    for sample in self.groups[group]
                )
            ]
            label_b = ', '.join(group_b)
            samples_b = [
                sample
                for i in group_b
                for sample in self.groups[i]
                if sample in self.channels
            ]

        return (samples_a, samples_b), (label_a, label_b), (group_a, group_b)

    def update_group_changes(self, group_a=None, group_b=None):
        '''
        Update a DataSet's Fold Change and p-value for each peptide using
        the given two-group comparison.

        Values are calculated based on changes between group_a and group_b.
        p-values are calculated as a 2-sample t-test.

        Parameters
        ----------
        group_a : str or list of str, optional
            Single or multiple groups to use for fold change numerator.
        group_b : str or list of str, optional
            Single or multiple groups to use for fold change denominator.
        '''
        (group_a, group_b), _, (self.group_a, self.group_b) = self.get_groups(
            group_a=group_a,
            group_b=group_b,
        )

        channels_a = [
            self.channels[i]
            for i in group_a
            if i in self.channels
        ]

        channels_b = [
            self.channels[i]
            for i in group_b
            if i in self.channels
        ]

        if channels_a and channels_b and self.shape[0] > 0:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', category=RuntimeWarning)

                self.psms['Fold Change'] = pd.Series(
                    np.nanmean(self.psms[channels_a], axis=1) /
                    np.nanmean(self.psms[channels_b], axis=1),
                    index=self.psms.index,
                )

                pvals = ttest_ind(
                    self.psms[channels_a],
                    self.psms[channels_b],
                    axis=1,
                    nan_policy='omit',
                )[1]

                if ma.is_masked(pvals):
                    pvals = ma.fix_invalid(pvals, fill_value=np.nan)
                elif isinstance(pvals, float):
                    pvals = [pvals]
                elif pvals.shape == ():
                    pvals = [pvals]

                self.psms['p-value'] = pd.Series(
                    pvals,
                    index=self.psms.index,
                )
        else:
            self.psms['Fold Change'] = np.nan
            self.psms['p-value'] = np.nan

    def norm_cmp_groups(self, cmp_groups, ctrl_groups=None, inplace=False):
        '''
        Normalize between groups in a list. This can be used to compare data
        sets that have comparable control groups.

        Channels within each list of groups are normalized to the mean of
        the group's channels.

        Parameters
        ----------
        cmp_groups : list of list of str
            List of groups to be normalized to each other.
            i.e. [('CK Hip', 'CK-p25 Hip'), ('CK Cortex', 'CK-p25 Cortex')]
        ctrl_groups : list of str, optional
            List of groups to use for baseline normalization. If not set,
            the first group from each comparison will be used.
        inplace : bool, optional
            Modify the data set in place, otherwise create a copy and return
            the new object.

        Examples
        --------
        >>> channels = ['a', 'b', 'c', 'd']
        >>> groups = {i: [i] for i in channels}
        >>> ds = data_sets.DataSet(channels=channels, groups=groups)
        >>> ds.add_peptide({'a': 1000, 'b': 500, 'c': 100, 'd': 25})
        >>> ds = ds.norm_cmp_groups([['a', 'b'], ['c', 'd']])
        >>> ds.data
        {'a': 1, 'b': 0.5, 'c': 1, 'd': .25}

        Returns
        -------
        ds : :class:`.DataSet`
        '''
        new = self

        if not inplace:
            new = new.copy()

        if new.cmp_groups:
            raise Exception('cmp_groups normalization already set')

        new.cmp_groups = [
            tuple(i) for i in cmp_groups
        ]

        # if len(cmp_groups) < 2:
        #     return new

        # assert not any(
        #     group in [
        #         i
        #         for o_groups in cmp_groups[ind + 1:]
        #         for i in o_groups
        #     ]
        #     for ind, groups in enumerate(cmp_groups)
        #     for group in groups
        # )

        for groups in cmp_groups:
            channels = [
                [
                    new.channels[name]
                    for name in new.groups[group]
                    if name in new.channels
                ]
                for group in groups
            ]
            group_names = [
                group for ind, group in enumerate(groups)
            ]

            if not any([len(i) > 0 for i in channels]):
                continue

            vals = new[
                [chan for chan_group in channels for chan in chan_group]
            ]

            if ctrl_groups:
                norm_vals = vals[[
                    chan
                    for i in ctrl_groups
                    if i in group_names
                    for chan in channels[group_names.index(i)]
                ]].median(axis=1)
            else:
                norm_vals = vals[
                    channels[0]
                ].median(axis=1)

            vals = vals.apply(lambda col: col / norm_vals)

            new.psms[
                [chan for chan_group in channels for chan in chan_group]
            ] = vals

        return new

    def log_stats(self):
        '''
        Log statistics information about peptides contained in the data set.
        This information includes total numbers, phospho-specificity,
        modification ambiguity, completeness of labeling, and missed
        cleavage counts.
        '''
        data_p = self.filter(mod='Phospho')
        data_no_p = self.filter(mod='Phospho', inverse=True)
        data_pst = self.filter(mod=[('S', 'Phospho'), ('T', 'Phospho')])
        data_py = self.filter(mod=[('Y', 'Phospho')])

        LOGGER.info('{}: Data Set Statistics:'.format(self.name))
        LOGGER.info(
            (
                '{}: -- {} pY - {} pST - {} total '
                '({:.0%} phospho specificity)'
            ).format(
                self.name,
                len(data_py.psms),
                len(data_pst.psms),
                len(self.psms),
                len(data_p.psms) / max([len(self.psms), 1]),
            )
        )
        LOGGER.info(
            (
                '{}: -- {} unique proteins'
            ).format(
                self.name,
                len(self.genes),
            )
        )

        if len(data_p.psms) > 0:
            per_prot_quant = (
                len(set(data_p.genes).intersection(data_no_p.genes)) /
                len(data_p.psms)
            )
            LOGGER.info(
                (
                    '{}: -- {:.0%} of phosphopeptides have '
                    'protein quantification'
                ).format(self.name, per_prot_quant)
            )

        per_amb = (
            data_p.filter(ambiguous=True).shape[0] /
            max([data_p.shape[0], 1])
        )
        LOGGER.info(
            (
                '{}: -- {:.0%} of phosphopeptides have an ambiguous '
                'assignment'
            ).format(self.name, per_amb)
        )

        labeled = self.filter(
            fn=lambda x: x['Sequence'].is_labeled
        ).shape[0]

        underlabeled = self.filter(
            fn=lambda x: x['Sequence'].is_underlabeled
        ).shape[0]

        per_lab = labeled / max([self.shape[0], 1])
        per_under_lab = underlabeled / max([self.shape[0], 1])

        LOGGER.info(
            (
                '{}: -- {:.0%} labeled - {:.0%} underlabeled'
            ).format(self.name, per_lab, per_under_lab)
        )

        mmc = self.psms['Missed Cleavages'].mean()
        LOGGER.info(
            (
                '{}: -- {:.1f} mean missed cleavages'
            ).format(self.name, mmc)
        )

    @property
    def genes(self):
        '''
        Get all UniProt gene names occurring in this data set.

        Returns
        -------
        list of str
        '''
        return sorted(
            set(
                gene
                for i in self.psms['Proteins']
                for gene in i.genes
            )
        )

    @property
    def phosphosites(self):
        '''
        Get a list of all unique phosphosites identified in a data set.

        Returns
        -------
        list of str

        Examples
        --------
        >>> ds.phosphosites
        ['Siglec5 pY561', 'Stat3 pY705', 'Inpp5d pY868']
        '''
        return sorted(
            set(
                self.psms.apply(
                    lambda x: '{0}{1}{2}'.format(
                        pyp.utils.get_name(x['Proteins']),
                        ' : '
                        if len(x['Modifications'].get_mods('Phospho')) > 0
                        else '',
                        re.sub(
                            r'(\d+)',
                            r'\1',
                            x['Modifications'].get_mods('Phospho').__str__(
                                prot_index=0,
                                show_mod_type=False,
                            ),
                        ),
                    ),
                    axis=1,
                ).values
            )
        )

    @property
    def accessions(self):
        '''
        Get all UniProt accessions occurring in this data set.

        Returns
        -------
        list of str

        Examples
        --------
        >>> ds.accessions
        ['P42227', 'Q920G3', 'Q9ES52']
        '''
        return sorted(
            set(
                gene
                for i in self.psms['Proteins']
                for gene in i.accessions
            )
        )

    @property
    def data(self):
        '''
        Get the quantification data for all samples and peptides in a data
        set.

        Returns
        -------
        df : :class:`pandas.DataFrame`
        '''
        return self.get_data()

    def get_data(self, groups=None, mods=None, short_name=False):
        '''
        Get the quantification data for all samples and peptides in a data
        set, with more customizable options.

        Parameters
        ----------
        groups : list of str, optional
            Select samples from a given list of groups, otherwise select
            all samples.
        mods : str or list of str, optional
            Option passed to :func:`.modification.Modification.__str__`.
        short_name : bool, optional
            Use the short abbreviation of a gene name (using
            :func:`pyproteome.utils.get_name`). Otherwise use the long
            version.

        Returns
        -------
        df : :class:`pandas.DataFrame`
        '''
        if groups is None:
            channel_names = self.samples
        else:
            # Restrict to samples within the requested groups
            channel_names = [
                sample
                for group in groups
                for sample in self.groups[group]
                if sample in self.channels
            ]

        d = self.psms[
            [
                self.channels[chan]
                for chan in channel_names
            ]
        ]

        d.index = self.psms.apply(
            lambda row: '{} {} - {}'.format(
                (
                    pyp.utils.get_name(row['Proteins'])
                    if short_name else
                    ' / '.join(row['Proteins'].genes)[:16]
                ),
                (
                    row['Modifications'].get_mods(mods)
                    if mods else
                    row['Modifications']
                ).__str__(prot_index=0),
                row['Sequence'],
            ),
            axis=1,
        )

        return d

    @property
    def intensity_data(self, groups=None, norm_cmp=False):
        '''
        Get the intensity (weight) data for all samples and peptides in a
        data set.

        Parameters
        ----------
        norm_cmp : bool, optional

        Returns
        -------
        df : :class:`pandas.DataFrame`
        '''
        if groups is None:
            channel_names = self.samples

        d = self.psms[
            [
                self.channels[chan] + '_weight'
                for chan in channel_names
            ]
        ]

        d.index = self.psms.apply(
            lambda row: '{} {} - {}'.format(
                ' / '.join(row['Proteins'].genes)[:16],
                row['Modifications'].__str__(prot_index=0),
                row['Sequence'],
            ),
            axis=1,
        )

        d.columns = [i.rsplit('_', 1)[0] for i in d.columns]

        return d

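
# Illustrative sketch (not part of the original module): pulling a
# quantification matrix out of a loaded data set for downstream analysis.
# ``ds`` stands for any loaded DataSet instance.
def _example_quant_matrix(ds):
    # One row per peptide, one column per sample, indexed by
    # '<gene> <modifications> - <sequence>' labels
    df = ds.get_data(short_name=True)

    # Scale each sample to its median signal before clustering / plotting
    return df / df.median(axis=0)
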

def load_all_data(
    chan_mapping=None,
    group_mapping=None,
    loaded_fn=None,
    norm_mapping=None,
    merge_mapping=None,
    merged_fn=None,
    kw_mapping=None,
    merge_only=True,
    replace_norm=True,
    **kwargs
):
    '''
    Load, normalize, and merge all data sets found in
    :const:`pyproteome.paths.MS_SEARCHED_DIR`.

    Parameters
    ----------
    chan_mapping : dict, optional
    group_mapping : dict, optional
    loaded_fn : func, optional
    norm_mapping : dict, optional
    merge_mapping : dict, optional
    merged_fn : func, optional
    kw_mapping : dict of (str, dict), optional
    merge_only : bool, optional
    replace_norm : bool, optional
        If true, only keep the normalized version of a data set. Otherwise
        return both normalized and unnormalized versions.
    kwargs : dict
        Any extra arguments are passed directly to :class:`.DataSet` during
        initialization.

    Returns
    -------
    datas : dict of str, :class:`.DataSet`

    Examples
    --------
    This example demonstrates how to automatically load, filter, normalize,
    and merge together several data sets::

        ckh_channels = OrderedDict(
            [
                ('3130 CK Hip', '126'),
                ('3131 CK-p25 Hip', '127'),
                ('3145 CK-p25 Hip', '128'),
                ('3146 CK-p25 Hip', '129'),
                ('3148 CK Hip', '130'),
                ('3157 CK Hip', '131'),
            ]
        )
        ckx_channels = OrderedDict(
            [
                ('3130 CK Cortex', '126'),
                ('3131 CK-p25 Cortex', '127'),
                ('3145 CK-p25 Cortex', '128'),
                ('3146 CK-p25 Cortex', '129'),
                ('3148 CK Cortex', '130'),
                ('3157 CK Cortex', '131'),
            ]
        )
        ckp25_groups = OrderedDict(
            [
                (
                    'CK',
                    [
                        '3130 CK Hip',
                        '3148 CK Hip',
                        '3157 CK Hip',
                        '3130 CK Cortex',
                        '3148 CK Cortex',
                        '3157 CK Cortex',
                    ],
                ),
                (
                    'CK-p25',
                    [
                        '3131 CK-p25 Hip',
                        '3145 CK-p25 Hip',
                        '3146 CK-p25 Hip',
                        '3131 CK-p25 Cortex',
                        '3145 CK-p25 Cortex',
                        '3146 CK-p25 Cortex',
                    ],
                ),
            ]
        )

        # With search data located as follows:
        #     Searched/
        #         CK-H1-pY.msf
        #         CK-H1-pST.msf
        #         CK-H1-Global.msf
        #         CK-X1-pY.msf
        #         CK-X1-pST.msf
        #         CK-X1-Global.msf

        # Load each data set, normalized to its respective global proteome
        # analysis:
        datas = data_sets.load_all_data(
            chan_mapping={
                'CK-H': ckh_channels,
                'CK-X': ckx_channels,
            },

            # Normalize pY, pST, and Global runs to each sample's global
            # data
            norm_mapping={
                'CK-H1': 'CK-H1-Global',
                'CK-X1': 'CK-X1-Global',
            },

            # Merge together normalized hippocampus and cortex runs
            merge_mapping={
                'CK Hip': ['CK-H1-pY', 'CK-H1-pST', 'CK-H1-Global'],
                'CK Cortex': ['CK-X1-pY', 'CK-X1-pST', 'CK-X1-Global'],
                'CK All': ['CK Hip', 'CK Cortex'],
            },

            groups=ckp25_groups,
        )

        # Alternatively, load each data set, using CONSTANd normalization:
        data_sets.constand.DEFAULT_CONSTAND_COL = 'kde'

        datas = data_sets.load_all_data(
            chan_mapping={
                'CK-H': ckh_channels,
                'CK-X': ckx_channels,
            },

            norm_mapping='constand',

            # Merge together normalized hippocampus and cortex runs
            merge_mapping={
                'CK Hip': ['CK-H1-pY', 'CK-H1-pST', 'CK-H1-Global'],
                'CK Cortex': ['CK-X1-pY', 'CK-X1-pST', 'CK-X1-Global'],
                'CK All': ['CK Hip', 'CK Cortex'],
            },

            groups=ckp25_groups,
        )
    '''
    chan_mapping = chan_mapping or {}
    group_mapping = group_mapping or {}
    norm_mapping = norm_mapping or {}
    merge_mapping = merge_mapping or {}
    kw_mapping = kw_mapping or {}

    datas = OrderedDict()

    for f in sorted(os.listdir(pyp.paths.MS_SEARCHED_DIR)):
        name, ext = os.path.splitext(f)

        if ext not in ['.msf']:
            continue

        if (
            merge_mapping and
            merge_only and
            name not in pyp.utils.flatten_set(list(merge_mapping.values()))
        ):
            continue

        kws = kwargs.copy()
        kws.update(kw_mapping.get(name, {}))

        # Run CONSTANd normalization before data filtering and integration
        if 'constand_norm' not in kws and norm_mapping == 'constand':
            kws['constand_norm'] = True

        chan = kws.pop('channels', None)
        group = kws.pop('groups', None)

        for key, val in chan_mapping.items():
            if name.startswith(key):
                chan = val
                break

        for key, val in group_mapping.items():
            if name.startswith(key):
                group = val
                break

        datas[name] = DataSet(
            name=name,
            channels=chan,
            groups=group,
            **kws
        )

        if loaded_fn:
            datas[name] = loaded_fn(name, datas[name])

    datas, mapped_names = norm_all_data(
        datas,
        norm_mapping,
        replace_norm=replace_norm,
    )
    datas = merge_all_data(
        datas,
        merge_mapping,
        mapped_names=mapped_names,
        merged_fn=merged_fn,
    )

    return datas


def norm_all_data(
    datas,
    norm_mapping,
    replace_norm=True,
    inplace=True,
):
    '''
    Normalize all data sets.

    Parameters
    ----------
    datas : dict of (str, :class:`.DataSet`)
    norm_mapping : dict of (str, str)
    replace_norm : bool, optional
    inplace : bool, optional
        Modify datas object inplace.

    Returns
    -------
    datas : dict of (str, :class:`.DataSet`)
    mapped_names : dict of (str, str)
    '''
    mapped_names = OrderedDict()

    if not inplace:
        datas = datas.copy()

    constand_norm = norm_mapping in ['constand']

    if norm_mapping in ['self', 'constand']:
        norm_mapping = {
            name: name + ('' if replace_norm else '-norm')
            for name in datas.keys()
            if not name.endswith('-norm')
        }

    for name, data in list(datas.items()):
        if name.endswith('-norm'):
            continue

        for key, val in norm_mapping.items():
            if not name.startswith(key):
                continue

            mapped_names[name] = '{}-norm'.format(name)

            if not constand_norm:
                data = data.normalize(datas[val], inplace=replace_norm)
            else:
                # data = constand.constand(
                #     data,
                #     inplace=replace_norm,
                # )
                pass

            data.name += '-norm'
            datas[mapped_names[name]] = data

            if replace_norm:
                del datas[name]

            break

    return datas, mapped_names


def merge_all_data(
    datas,
    merge_mapping,
    mapped_names=None,
    merged_fn=None,
    inplace=True,
):
    '''
    Merge together multiple data sets.

    Parameters
    ----------
    datas : dict of (str, :class:`.DataSet`)
    merge_mapping : dict of (str, list of str)
    mapped_names : dict of (str, str), optional
    merged_fn : func, optional
    inplace : bool, optional
        Modify datas object inplace.

    Returns
    -------
    datas : dict of (str, :class:`.DataSet`)
    '''
    if not inplace:
        datas = datas.copy()

    if mapped_names is None:
        mapped_names = {}

    rmap = {val: key for key, val in mapped_names.items()}

    if merge_mapping:
        for name in datas.keys():
            if not any(
                name in vals or
                rmap.get(name, name) in vals or
                name == key
                for key, vals in merge_mapping.items()
            ):
                LOGGER.warning('Unmerged data: {}'.format(name))

    for key, vals in merge_mapping.items():
        if not any([mapped_names.get(val, val) in datas for val in vals]):
            continue

        datas[key] = merge_data(
            [
                datas[mapped_names.get(val, val)]
                for val in vals
                if mapped_names.get(val, val) in datas
            ],
            name=key,
        )

        if merged_fn:
            datas[key] = merged_fn(key, datas[key])

    return datas


def merge_data(
    data_sets,
    name=None,
    norm_channels=None,
    merge_duplicates=True,
):
    '''
    Merge a list of data sets together.

    Parameters
    ----------
    data_sets : list of :class:`.DataSet`
    name : str, optional
    norm_channels : dict of (str, str), optional
    merge_duplicates : bool, optional

    Returns
    -------
    ds : :class:`.DataSet`
    '''
    new = DataSet(
        name=name,
        skip_logging=True,
        skip_load=True,
        filter_bad=False,
        constand_norm=False,
    )

    if len(data_sets) < 1:
        return new

    # if any(not isinstance(data, DataSet) for data in data_sets):
    #     raise TypeError(
    #         'Incompatible types: {}'.format(
    #             [type(data) for data in data_sets]
    #         )
    #     )

    for index, data in enumerate(data_sets):
        data = data.rename_channels()

        # Update new.groups
        for group, samples in data.groups.items():
            if group not in new.groups:
                new.groups[group] = samples
                continue

            new.groups[group] += [
                sample
                for sample in samples
                if sample not in new.groups[group]
            ]

        if not data.inter_normalized:
            # Normalize data sets to their common channels
            inter_norm = norm_channels

            if not inter_norm:
                inter_norm = set(data.channels)

                if index > 0:
                    inter_norm = inter_norm.intersection(new.channels)
                elif len(data_sets) > 1:
                    inter_norm = inter_norm.intersection(
                        data_sets[1].channels
                    )

            data = data.inter_normalize(
                other=new if index > 0 else None,
                norm_channels=inter_norm,
            )

        for key, val in data.channels.items():
            assert new.channels.get(key, val) == val

            if key not in new.channels:
                new.channels[key] = val

        new.psms = _concat(
            [new.psms, data.psms],
        ).reset_index(drop=True)

        if merge_duplicates:
            new.merge_duplicates(inplace=True)

    new.sets = sum(data.sets for data in data_sets)

    new.inter_normalized = True

    new.group_a = next(
        (i.group_a for i in data_sets if i.group_a is not None),
        None,
    )
    new.group_b = next(
        (i.group_b for i in data_sets if i.group_b is not None),
        None,
    )

    new.cmp_groups = sorted(
        set(
            grp
            for i in data_sets
            if i.cmp_groups is not None
            for grp in i.cmp_groups
        )
    ) or None

    new.species = set(
        species
        for i in data_sets
        for species in i.species
    )

    new.update_group_changes()

    if name:
        new.name = name

    new.log_stats()

    return new

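
# Illustrative sketch (not part of the original module): merging two
# separately loaded TMT runs on a shared bridge channel. 'pooled' is a
# hypothetical sample name present in both runs.
def _example_merge(ds_a, ds_b):
    return merge_data(
        [ds_a, ds_b],
        name='Combined',
        norm_channels=['pooled'],
    )
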

def _unfirst(x):
    return x.values[0]


def _first(x):
    if not all(i == x.values[0] for i in x.values):
        LOGGER.warning(
            'Mismatch between peptide data: \'{}\' not in {}'
            .format(
                x.values[0],
                [str(i) for i in x.values[1:]],
            )
        )

    return x.values[0]


def merge_proteins(ds, inplace=False, fn=None):
    '''
    Merge together all peptides mapped to the same protein. Maintains the
    first available peptide and calculates the median quantification value
    for each protein across all of its peptides.

    Parameters
    ----------
    ds : :class:`.DataSet`
    inplace : bool, optional
    fn : str or func, optional
        Aggregation function applied to each protein's quantification
        values (defaults to 'median').

    Returns
    -------
    ds : :class:`.DataSet`
    '''
    new = ds

    if not inplace:
        new = new.copy()

    if len(new.psms) < 1:
        return new

    if fn is None:
        fn = 'median'

    channels = list(new.channels.values())
    agg_dict = {}

    for channel in channels:
        weight = '{}_weight'.format(channel)

        if weight in new.psms.columns:
            # XXX: Use weight corresponding to median channel value?
            # (not median weight)
            agg_dict[weight] = fn

        agg_dict[channel] = fn

    agg_dict['Sequence'] = _unfirst
    agg_dict['Modifications'] = _unfirst
    agg_dict['Missed Cleavages'] = _unfirst
    agg_dict['Validated'] = all
    agg_dict['Scan Paths'] = _unfirst
    agg_dict['Raw Paths'] = _unfirst
    agg_dict['Ambiguous'] = _unfirst
    agg_dict['Masses'] = _unfirst
    agg_dict['Charges'] = _unfirst
    agg_dict['Intensities'] = _unfirst
    agg_dict['RTs'] = _unfirst
    agg_dict['Scan'] = _unfirst
    agg_dict['Ion Score'] = max
    agg_dict['q-value'] = min
    agg_dict['Confidence Level'] = partial(
        max,
        key=lambda x: ['Low', 'Medium', 'High'].index(x),
    )
    agg_dict['Isolation Interference'] = min
    agg_dict['Unique Peptides'] = len

    new.psms['Unique Peptides'] = np.nan

    new.psms = new.psms.groupby(
        by=[
            'Proteins',
        ],
        sort=False,
        as_index=False,
    ).agg(agg_dict)

    new.update_group_changes()

    return new


def _cv(x):
    x = variation(x, nan_policy='omit')
    return np.nan if ma.is_masked(x) else x


def update_correlation(ds, corr, metric='spearman', min_periods=5):
    '''
    Update a table's Fold-Change and p-value columns.

    Values are calculated based on the correlation between each row's
    quantification values and the given series.

    Parameters
    ----------
    ds : :class:`.DataSet`
    corr : :class:`pandas.Series`
    metric : str, optional
    min_periods : int, optional

    Returns
    -------
    ds : :class:`.DataSet`
    '''
    ds = ds.copy()

    corr = pd.to_numeric(
        corr[(~corr.isnull()) & (corr.index.isin(ds.psms.columns))]
    )

    # The '> 2' guards avoid errors with correlation of 2 points
    metric = {
        'spearman': lambda x: spearmanr(
            x,
            b=corr,
            nan_policy='omit',
        ) if (~x.isnull()).sum() > 2 else (np.nan, np.nan),
        'pearson': lambda x: pearsonr(
            x[~x.isnull()],
            y=corr[~x.isnull()],
        ) if (~x.isnull()).sum() > 2 else (np.nan, np.nan),
    }[metric]

    cols = list(corr.index)
    df = ds.psms[cols].apply(pd.to_numeric)

    vals = df.apply(metric, axis=1, result_type='expand')

    vals[
        (~df.isnull()).sum(axis=1) < min_periods
    ] = np.nan

    ds.psms[['Fold Change', 'p-value']] = vals

    return ds

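
# Illustrative sketch (not part of the original module): correlating each
# peptide's quantification profile with a per-sample phenotype measurement.
# The sample names below are hypothetical and must match ds.psms columns.
def _example_correlation(ds):
    phenotype = pd.Series({
        'WT #1': 0.8,
        'WT #2': 1.1,
        'KO #1': 3.5,
        'KO #2': 4.2,
    })

    # 'Fold Change' is replaced by each peptide's correlation coefficient
    # and 'p-value' by its uncorrected p-value
    return update_correlation(ds, phenotype, metric='spearman')
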

def _pair_corr(row, x_val, y_val, z_val=None, method='spearman'):
    covar = ['x', 'y']

    if z_val is not None:
        covar += ['z']

    pcorr_df = pd.DataFrame(
        zip(
            pd.to_numeric(row[x_val.index.tolist()]),
            x_val,
            y_val,
            z_val if z_val is not None else [1] * len(x_val),
        ),
        columns=['row', 'x', 'y', 'z'],
    )

    x_pair_corr = pcorr_df.pairwise_corr(
        covar=[i for i in covar if i != 'x'],
        method=method,
    )
    y_pair_corr = pcorr_df.pairwise_corr(
        covar=[i for i in covar if i != 'y'],
        method=method,
    )

    if z_val is not None:
        z_pair_corr = pcorr_df.pairwise_corr(
            covar=[i for i in covar if i != 'z'],
            method=method,
        )

    return pd.Series(
        (
            x_pair_corr['r'].iloc[0],
            x_pair_corr['p-unc'].iloc[0],
            y_pair_corr['r'].iloc[0],
            y_pair_corr['p-unc'].iloc[0],
            z_pair_corr['r'].iloc[0] if z_val is not None else None,
            z_pair_corr['p-unc'].iloc[0] if z_val is not None else None,
        )[:6 if z_val is not None else 4],
        index=[
            'Correlation-x',
            'p-value-x',
            'Correlation-y',
            'p-value-y',
            'Correlation-z',
            'p-value-z',
        ][:6 if z_val is not None else 4],
    )


def update_pairwise_corr(ds, x_val, y_val, inplace=False):
    if not inplace:
        ds = ds.copy()

    vals = ds.psms.apply(
        partial(_pair_corr, x_val=x_val, y_val=y_val),
        axis=1,
        result_type='expand',
    )

    ds[vals.columns.tolist()] = vals

    return ds


def _concat(dfs):
    cols = []

    for frame in dfs:
        for col in frame.columns:
            if col not in cols:
                cols.append(col)

    df_dict = dict.fromkeys(cols, [])

    def _fast_flatten(input_list):
        return list(chain.from_iterable(input_list))

    for col in cols:
        # Flatten and save to df_dict
        df_dict[col] = list(
            chain.from_iterable(
                frame[col]
                if col in frame.columns else
                [np.nan] * frame.shape[0]
                for frame in dfs
            )
        )

    df = pd.DataFrame.from_dict(df_dict)[cols]

    return df