# Source code for pyproteome.utils

'''Utility functions used in other modules.'''

# Built-ins
try:
    # The ABC aliases in ``collections`` were removed in Python 3.10; they
    # live in ``collections.abc`` (available since Python 3.3).
    from collections import OrderedDict
    from collections.abc import Callable
except ImportError:
    from collections import OrderedDict, Callable
import copy
import difflib
import functools
import os
import pickle
import types

import numpy as np
import pandas as pd

from . import paths


# Resolution, in dots per inch, for rendered figures. Nothing in this part of
# the module reads it -- presumably the plotting code does; verify against
# callers.
DEFAULT_DPI = 300
'''
The DPI to use when generating all image figures.
'''


def fuzzy_find(needle, haystack):
    '''
    Locate needle within haystack, tolerating mismatched characters.

    The longest block of characters shared by both strings is found with
    :class:`difflib.SequenceMatcher`. The start of that block in haystack is
    then shifted back by the block's offset within needle, giving the index
    of haystack that lines up with the first character of needle.

    Parameters
    ----------
    needle : str
    haystack : str

    Returns
    -------
    index : int
        Index into haystack aligned with the beginning of needle. This is
        negative when the aligned needle would start before haystack does.
        If the two strings share no characters at all, ``-len(needle)`` is
        returned.

    Examples
    --------
        >>> utils.fuzzy_find('abc', 'zzabczz')
        2
        >>> utils.fuzzy_find('Xabc', 'zzabczz')
        1
        >>> utils.fuzzy_find('abcX', 'zzabczz')
        2
    '''
    matcher = difflib.SequenceMatcher(a=haystack, b=needle)
    best = matcher.find_longest_match(0, len(haystack), 0, len(needle))

    if best.size == 0:
        # Nothing in common: keep the value this function has always
        # returned for that case rather than reporting a bogus index 0.
        return -len(needle)

    # best.a and best.b are where the shared block starts in haystack and in
    # needle respectively; their difference is where needle[0] lines up.
    # (Subtracting ``len(needle) - best.size`` instead is only correct when
    # the shared block happens to end exactly at the end of needle.)
    return best.a - best.b


def make_folder(data=None, folder_name=None, sub='Output'):
    '''
    Creates an output folder, deriving its path when none is given.

    When folder_name is None, the path is built from
    ``paths.FIGURES_DIR``, the name of data (or ``'All'`` when data is
    None) and sub. An explicit folder_name is used unchanged.

    Parameters
    ----------
    data : object with a ``name`` attribute, optional
    folder_name : str, optional
    sub : str, optional

    Returns
    -------
    folder_name : str
        The value returned by :func:`makedirs` for the chosen path.
    '''
    if folder_name is not None:
        return makedirs(folder_name)

    if data is None:
        set_name = 'All'
    else:
        set_name = data.name

    return makedirs(os.path.join(paths.FIGURES_DIR, set_name, sub))


def makedirs(folder_name=None):
    '''
    Creates a folder (and any missing parents) if it does not exist.

    Parameters
    ----------
    folder_name : str, optional
        Nothing is created when this is None or an empty string.

    Returns
    -------
    folder_name : str
        The argument, returned unchanged.

    Raises
    ------
    OSError
        If the folder could not be created and does not already exist as a
        directory (e.g. permission denied, or a regular file is in the way).
    '''
    if not folder_name:
        return folder_name

    try:
        os.makedirs(folder_name)
    except OSError:
        # An already-existing directory is the expected, harmless case. Any
        # other failure means the caller would be handed back a path that
        # cannot be used, so let it propagate instead of hiding it.
        if not os.path.isdir(folder_name):
            raise

    return folder_name


def norm(channels):
    '''
    Appends the ``_norm`` suffix to channel names.

    Strings get the suffix directly; lists and dictionaries are walked
    recursively, with dictionary keys kept and only their values renamed.

    Parameters
    ----------
    channels : str or list of str or dict of (str, str) or None

    Returns
    -------
    new_channels : str or list of str or OrderedDict of (str, str) or None
        None is returned for None and for any other unsupported type.
    '''
    if isinstance(channels, str):
        return channels + '_norm'

    if isinstance(channels, list):
        return [norm(item) for item in channels]

    # OrderedDict is a dict subclass, so one check covers both.
    if isinstance(channels, dict):
        renamed = OrderedDict()

        for key, val in channels.items():
            renamed[key] = norm(val)

        return renamed

    return None


def which(program):
    '''
    Checks if a program exists in PATH's list of directories.

    If program contains a directory component it is checked as given;
    otherwise every directory listed in the ``PATH`` environment variable
    is searched in order.

    Parameters
    ----------
    program : str

    Returns
    -------
    path : str or None
        Path to the first executable file found. None if there is no match
        or if ``PATH`` is not set.
    '''
    def is_exe(fpath):
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    directory, _ = os.path.split(program)

    if directory:
        return program if is_exe(program) else None

    # PATH may be missing entirely (e.g. a stripped-down environment); that
    # means there is nothing to search rather than an error.
    search_path = os.environ.get('PATH')

    if search_path is None:
        return None

    for entry in search_path.split(os.pathsep):
        # Entries are sometimes wrapped in quotes (notably double quotes on
        # Windows); strip either kind before building the candidate path.
        candidate = os.path.join(entry.strip('\'"'), program)

        if is_exe(candidate):
            return candidate

    return None


def flatten_set(lst):
    '''
    Collapses an arbitrarily nested Iterable into one flat set.

    Only lists, tuples, sets, generators, :class:`pandas.Series` and
    :class:`numpy.ndarray` objects are descended into. Anything else
    (strings included) is treated as a single element.

    Parameters
    ----------
    lst : Iterable

    Returns
    -------
    flattened : set

    Examples
    --------
        >>> utils.flatten_set([0, [1, 2], [[3]], 'string'])
        {0, 1, 2, 3, 'string'}
    '''
    containers = (
        list, tuple, set, types.GeneratorType, pd.Series, np.ndarray,
    )

    # Leaf value: wrap it so callers can always merge the result.
    if not isinstance(lst, containers):
        return {lst}

    flattened = set()

    for item in lst:
        flattened.update(flatten_set(item))

    return flattened


def flatten_list(lst):
    '''
    Collapses an arbitrarily nested Iterable into one flat list.

    Only lists, tuples, sets, generators, :class:`pandas.Series` and
    :class:`numpy.ndarray` objects are descended into. Anything else
    (strings included) is treated as a single element. Elements keep the
    order in which they are encountered.

    Parameters
    ----------
    lst : Iterable

    Returns
    -------
    flattened : list

    Examples
    --------
        >>> utils.flatten_list([0, [1, 2], [[3]], 'string'])
        [0, 1, 2, 3, 'string']
    '''
    containers = (
        list, tuple, set, types.GeneratorType, pd.Series, np.ndarray,
    )

    # Leaf value: wrap it so callers can always concatenate the result.
    if not isinstance(lst, containers):
        return [lst]

    flattened = []

    for item in lst:
        flattened.extend(flatten_list(item))

    return flattened


class DefaultOrderedDict(OrderedDict):
    '''
    An ordered dictionary that builds missing values on first access.

    Combines :class:`collections.OrderedDict` with the missing-key behaviour
    of :class:`collections.defaultdict`.

    Parameters
    ----------
    default_factory : callable, optional
        Called with no arguments to create the value for a missing key. When
        None, looking up a missing key raises KeyError.
    *a, **kw
        Forwarded to :class:`collections.OrderedDict`.

    Raises
    ------
    TypeError
        If default_factory is neither None nor callable.
    '''
    # Source: http://stackoverflow.com/a/6190500/562769
    def __init__(self, default_factory=None, *a, **kw):
        # The builtin callable() avoids depending on the ``Callable`` ABC,
        # whose ``collections.Callable`` alias was removed in Python 3.10.
        if default_factory is not None and not callable(default_factory):
            raise TypeError('first argument must be callable')
        OrderedDict.__init__(self, *a, **kw)
        self.default_factory = default_factory

    def __getitem__(self, key):
        try:
            return OrderedDict.__getitem__(self, key)
        except KeyError:
            return self.__missing__(key)

    def __missing__(self, key):
        if self.default_factory is None:
            raise KeyError(key)
        # Store the new value so later lookups return the same object.
        self[key] = value = self.default_factory()
        return value

    def __reduce__(self):
        if self.default_factory is None:
            args = tuple()
        else:
            args = self.default_factory,
        # pickle requires the fifth element to be an *iterator* of
        # (key, value) pairs; a dict items view is rejected on Python 3.
        return type(self), args, None, None, iter(self.items())

    def copy(self):
        '''
        Returns a shallow copy that keeps the same default_factory.
        '''
        return self.__copy__()

    def __copy__(self):
        return type(self)(self.default_factory, self)

    def __deepcopy__(self, memo):
        # Items views cannot be deep-copied on Python 3, so copy the pairs
        # one by one. Registering the clone in memo first keeps
        # self-referencing structures from recursing forever.
        clone = type(self)(self.default_factory)
        memo[id(self)] = clone

        for key, value in self.items():
            clone[copy.deepcopy(key, memo)] = copy.deepcopy(value, memo)

        return clone

    def __repr__(self):
        return 'OrderedDefaultDict(%s, %s)' % (
            self.default_factory,
            OrderedDict.__repr__(self),
        )


def memoize(func):
    '''
    Decorator that caches a function's return values in memory.

    Each call is looked up by the string form of its positional and keyword
    arguments, so unhashable arguments are accepted. Two different arguments
    whose ``str()`` output is identical share one cache entry. The cache
    dictionary is exposed as the ``cache`` attribute of func.

    Examples
    --------
    >>> from pyproteome import utils
    >>> @utils.memoize
    ... def download_data(species):
    ...    ...  # Fetch / calculate the return value once


    Parameters
    ----------
    func : func

    Returns
    -------
    memorized : func
    '''
    results = {}
    func.cache = results

    @functools.wraps(func)
    def cached_call(*args, **kwargs):
        lookup = str(args) + str(kwargs)

        if lookup in results:
            return results[lookup]

        value = func(*args, **kwargs)
        results[lookup] = value
        return value

    return cached_call


# Relative path: save() and load() join it with '<name>.pkl', so the directory
# is resolved against the current working directory of the process.
PICKLE_DIR = '.pyproteome'
'''
Default directory to use for saving / loading pickle files.
'''


def save(name, val=None):
    '''
    Pickles a value to ``<PICKLE_DIR>/<name>.pkl``.

    Parameters
    ----------
    name : str
        The name to use for data storage.
    val : object, optional
        The object to store.

    Returns
    -------
    val : object
        The same object that was passed in.
    '''
    pickle_path = os.path.join(PICKLE_DIR, '{}.pkl'.format(name))

    # Create the storage directory if needed before opening the file.
    makedirs(PICKLE_DIR)

    with open(pickle_path, 'wb') as handle:
        pickle.dump(val, handle)

    return val


def load(name, default=None):
    '''
    Unpickles the value stored at ``<PICKLE_DIR>/<name>.pkl``.

    Unpickling can execute arbitrary code, so only load files written by a
    trusted source.

    Parameters
    ----------
    name : str
        The name to use for data storage.
    default : object, optional
        Returned when the file cannot be opened or unpickled.

    Returns
    -------
    val : object
    '''
    pickle_path = os.path.join(PICKLE_DIR, '{}.pkl'.format(name))

    # A missing, truncated or otherwise unreadable pickle falls back to the
    # default instead of raising.
    recoverable = (
        OSError, pickle.UnpicklingError, IOError,
        AttributeError, EOFError, ImportError, IndexError,
    )

    try:
        with open(pickle_path, 'rb') as handle:
            return pickle.load(handle)
    except recoverable:
        return default


def adjust_text(*args, **kwargs):
    '''
    Wraps importing and calling :func:`adjustText.adjust_text`.

    The import happens inside the function, so ``adjustText`` is only needed
    when this function is actually called. All arguments are passed through
    and the wrapped function's return value is returned.
    '''
    from adjustText import adjust_text as _adjust_text

    result = _adjust_text(*args, **kwargs)
    return result


def get_name(proteins):
    '''
    Generates a shortened version of a protein name. For peptides
    that map to multiple proteins, this function finds the longest
    common prefix of the sorted gene names and writes it only once.

    The shared prefix is cut back to just before the last number it
    contains, so that a number is never split between the prefix and the
    per-gene suffixes. When the genes share no usable prefix, the full names
    are joined with ``' / '``.

    Parameters
    ----------
    proteins : :class:`.data_sets.protein.Proteins`
        Only its ``genes`` attribute (an iterable of str) is read.

    Returns
    -------
    str

    Examples
    --------
    >>> pyp.utils.get_name(
    ...     protein.Proteins([
    ...         protein.Protein(gene='Dpysl2'),
    ...         protein.Protein(gene='Dpysl3'),
    ...     ])
    ... )
    'Dpysl2/3'
    >>> pyp.utils.get_name(
    ...     protein.Proteins([
    ...         protein.Protein(gene='Src'),
    ...         protein.Protein(gene='Fgr'),
    ...         protein.Protein(gene='Fyn'),
    ...     ])
    ... )
    'Fgr / Fyn / Src'
    >>> pyp.utils.get_name(
    ...     protein.Proteins([
    ...         protein.Protein(gene='Tuba1a'),
    ...         protein.Protein(gene='Tuba1b'),
    ...         protein.Protein(gene='Tuba1c'),
    ...         protein.Protein(gene='Tuba4a'),
    ...         protein.Protein(gene='Tuba8'),
    ...     ])
    ... )
    'Tuba1a/1b/1c/4a/8'
    '''
    genes = sorted(proteins.genes)

    # Zero or one gene: there is no prefix to factor out.
    if len(genes) < 2:
        return ' / '.join(genes)

    common = os.path.commonprefix(genes)
    digit_positions = [
        ind
        for ind, char in enumerate(common)
        if char.isdigit()
    ]

    if digit_positions:
        cut = digit_positions[-1]

        # Step back over the whole run of digits: cutting at the last digit
        # alone would turn 'Abc12a' + 'Abc12b' into 'Abc12a/2b'.
        while cut > 0 and common[cut - 1].isdigit():
            cut -= 1

        common = common[:cut]

    sep = '/' if common else ' / '

    return common + sep.join(gene[len(common):] for gene in genes)


def stars(p, ns='ns'):
    '''
    Converts a p-value into a significance label made of stars.

    \\*\\*\\*\\* : p < 1e-4

    \\*\\*\\* : p < 1e-3

    \\*\\* : p < 1e-2

    \\* : p < 5e-2

    ns : not significant

    Parameters
    ----------
    p : float
    ns : str, optional
        Label returned when p is below none of the cutoffs.

    Returns
    -------
    str
    '''
    # Ordered from strictest to loosest; the first cutoff that p falls under
    # decides the label.
    cutoffs = (
        (1e-4, '****'),
        (1e-3, '***'),
        (1e-2, '**'),
        (0.05, '*'),
    )

    for cutoff, label in cutoffs:
        if p < cutoff:
            return label

    return ns