Source code for pyproteome.utils

'''Utility functions used in other modules.'''

# Built-ins
from collections import OrderedDict, Callable
import copy
import difflib
import functools
import os
import pickle
import types

import numpy as np
import pandas as pd

from . import paths


DEFAULT_DPI = 300
'''
The DPI to use when generating all image figures.
'''


[docs]def fuzzy_find(needle, haystack): ''' Find the longest matching subsequence of needle within haystack. Returns the corresponding index from the beginning of needle. Parameters ---------- needle : str haystack : str Returns ------- index : int ''' s = difflib.SequenceMatcher(a=haystack, b=needle) best = s.find_longest_match(0, len(haystack), 0, len(needle)) return best.a - len(needle) + best.size
[docs]def make_folder(data=None, folder_name=None, sub='Output'): if folder_name is None: folder_name = os.path.join( paths.FIGURES_DIR, data.name if data is not None else 'All', sub, ) return makedirs(folder_name)
[docs]def makedirs(folder_name=None): ''' Creates a folder if it does not exist. Parameters ---------- folder_name : str, optional Returns ------- folder_name : str ''' if folder_name: try: os.makedirs(folder_name) except OSError: pass return folder_name
[docs]def norm(channels): ''' Converts a list of channels to their normalized names. Parameters ---------- channels : list of str or dict of (str, str) or None Returns ------- new_channels : list of str or dict of str, str ''' if channels is None: return None if isinstance(channels, str): return channels + '_norm' if isinstance(channels, list): return [norm(i) for i in channels] if isinstance(channels, (dict, OrderedDict)): return OrderedDict( (key, norm(val)) for key, val in channels.items() )
[docs]def which(program): ''' Checks if a program exists in PATH's list of directories. Parameters ---------- program : str Returns ------- path : str or None ''' def is_exe(fpath): return os.path.isfile(fpath) and os.access(fpath, os.X_OK) fpath, _ = os.path.split(program) if fpath: if is_exe(program): return program else: for path in os.environ['PATH'].split(os.pathsep): path = path.strip('\'') exe_file = os.path.join(path, program) if is_exe(exe_file): return exe_file return None
[docs]def flatten_set(lst): ''' Flattens an Iterable with arbitrary nesting into a single set. Parameters ---------- lst : Iterable Returns ------- flattened : set Examples -------- >>> utils.flatten_set([0, [1, 2], [[3]], 'string']) set([0, 1, 2, 3, 'string']) ''' if isinstance( lst, (list, tuple, set, types.GeneratorType, pd.Series, np.ndarray) ): ret = set() for element in lst: for new_element in flatten_set(element): ret.add(new_element) return ret return set([lst])
[docs]def flatten_list(lst): ''' Flattens an Iterable with arbitrary nesting into a single list. Parameters ---------- lst : Iterable Returns ------- flattened : list Examples -------- >>> utils.flatten_list([0, [1, 2], [[3]], 'string']) [0, 1, 2, 3, 'string'] ''' if isinstance( lst, (list, tuple, set, types.GeneratorType, pd.Series, np.ndarray) ): ret = [] for element in lst: for new_element in flatten_list(element): ret.append(new_element) return ret return [lst]
[docs]class DefaultOrderedDict(OrderedDict): # Source: http://stackoverflow.com/a/6190500/562769 def __init__(self, default_factory=None, *a, **kw): if (default_factory is not None and not isinstance(default_factory, Callable)): raise TypeError('first argument must be callable') OrderedDict.__init__(self, *a, **kw) self.default_factory = default_factory def __getitem__(self, key): try: return OrderedDict.__getitem__(self, key) except KeyError: return self.__missing__(key) def __missing__(self, key): if self.default_factory is None: raise KeyError(key) self[key] = value = self.default_factory() return value def __reduce__(self): if self.default_factory is None: args = tuple() else: args = self.default_factory, return type(self), args, None, None, self.items()
[docs] def copy(self): return self.__copy__()
def __copy__(self): return type(self)(self.default_factory, self) def __deepcopy__(self, memo): return type(self)(self.default_factory, copy.deepcopy(self.items())) def __repr__(self): return 'OrderedDefaultDict(%s, %s)' % ( self.default_factory, OrderedDict.__repr__(self), )
[docs]def memoize(func): ''' Memoize a function, saving its returned value for a given set of parameters in an in-memory cache. Examples -------- >>> from pyproteome import utils >>> @utils.memoize ... def download_data(species): ... ... # Fetch / calculate the return value once Parameters ---------- func : func Returns ------- memorized : func ''' cache = func.cache = {} @functools.wraps(func) def memoized_func(*args, **kwargs): key = str(args) + str(kwargs) if key not in cache: cache[key] = func(*args, **kwargs) return cache[key] return memoized_func
PICKLE_DIR = '.pyproteome' ''' Default directory to use for saving / loading pickle files. '''
[docs]def save(name, val=None): ''' Save a variable using the pickle module. Parameters ---------- name : str The name to use for data storage. val : object, optional Returns ------- val : object ''' filename = os.path.join(PICKLE_DIR, '{}.pkl'.format(name)) makedirs(PICKLE_DIR) with open(filename, 'wb') as f: pickle.dump(val, f) return val
[docs]def load(name, default=None): ''' Load a variable using the pickle module. Parameters ---------- name : str The name to use for data storage. default : object, optional Returns ------- val : object ''' filename = os.path.join(PICKLE_DIR, '{}.pkl'.format(name)) try: with open(filename, 'rb') as f: val = pickle.load(f) except ( OSError, pickle.UnpicklingError, IOError, AttributeError, EOFError, ImportError, IndexError, ): val = default return val
[docs]def adjust_text(*args, **kwargs): ''' Wraps importing and calling :func:`adjustText.adjust_text`. ''' from adjustText import adjust_text as at return at(*args, **kwargs)
[docs]def get_name(proteins): ''' Generates a shortened version of a protein name. For peptides that map to multiple proteins, this function finds the longest common prefix (excluding digits) that matches all proteins. Parameters ---------- proteins : :class:`.data_sets.protein.Proteins` Returns ------- str Examples -------- >>> pyp.utils.get_name( ... protein.Proteins([ ... protein.Protein(gene='Dpysl2'), ... protein.Protein(gene='Dpysl3'), ... ]) ... ) 'Dpysl2/3' >>> pyp.utils.get_name( ... protein.Proteins([ ... protein.Protein(gene='Src'), ... protein.Protein(gene='Fgr'), ... protein.Protein(gene='Fyn'), ... ]) ... ) 'Src / Fgr / Fyn' >>> pyp.utils.get_name( ... protein.Proteins([ ... protein.Protein(gene='Tuba1a'), ... protein.Protein(gene='Tuba1b'), ... protein.Protein(gene='Tuba1c'), ... protein.Protein(gene='Tuba4a'), ... protein.Protein(gene='Tuba8'), ... ]) ... ) 'Tuba1a/1b/1c/3a/4a/8' ''' genes = sorted(proteins.genes) common = '' sep = ' / ' if len(genes) > 1: common = os.path.commonprefix(genes) last_digit = [ ind for ind, i in list(enumerate(common))[::-1] if i.isdigit() ] if last_digit: common = common[:last_digit[0]] if common: sep = '/' return common + sep.join(i[len(common):] for i in genes)
[docs]def stars(p, ns='ns'): ''' Calculate the stars to indicate significant changes. \\*\\*\\*\\* : p < 1e-4 \\*\\*\\* : p < 1e-3 \\*\\* : p < 1e-2 \\* : p < 5e-2 ns : not significant Parameters ---------- p : float ns : str, optional Returns ------- str ''' if p < 1e-4: return '****' elif (p < 1e-3): return '***' elif (p < 1e-2): return '**' elif (p < 0.05): return '*' else: return ns