Source code for pyproteome.discoverer

'''
This module provides functionality for accessing searched data from Proteome
Discoverer.
'''

from collections import defaultdict
import logging
import os
import re
import sqlite3
import sys
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd

from pyproteome import (
    data_sets, paths, pypuniprot,
)


LOGGER = logging.getLogger('pyproteome.discoverer')
RE_PSP = re.compile(
    r'(\w+)\((\d+)\): ([\d\.]+)'
)
RE_GENE = re.compile(
    r'^'
    r'.*'
    r' GN=([A-Za-z0-9_\-]+)( \w+)?'
)
RE_GENE_BACKUP = re.compile(
    r'^'
    r'(gi\|[\dA-Za-z]+\|)?'
    r'(uc\|(([\dA-Za-z]+)\|)?([\dA-Za-z\.]+\|)?)?'
    r'(ref\|([\dA-Za-z\._]+)\|)?'
    r'(gb\|([\dA-Za-z\._]+)\|)?'
    r'(gnl\|[\dA-Za-z]+\|)?'
    r'(sp\|[\dA-Za-z\-]+\|)?'
    r' ?([\dA-Za-z_\:\-]+) ')
RE_DESCRIPTION = re.compile(
    r'^'
    r'(gi\|[\dA-Za-z]+\|)?'
    r'(uc\|(([\dA-Za-z]+)\|)?([\dA-Za-z\.]+\|)?)?'
    r'(ref\|([\dA-Za-z\._]+)\|)?'
    r'(gb\|([\dA-Za-z\._]+)\|)?'
    r'(gnl\|[\dA-Za-z]+\|)?'
    r'(sp\|[\dA-Za-z\-]+\|)?'
    r' ?[\dA-Za-z_\-\:]+ (.+?)( OS=| GN=| PE=| SV=| \[|$)'
)
CONFIDENCE_MAPPING = {1: 'Low', 2: 'Medium', 3: 'High'}
PD1_SUPPORTED_VERSIONS = [(1, 4)]
PD2_SUPPORTED_VERSIONS = [(2, 2), (2, 3), (2, 4), (2, 5)]


def _get_pd_version(cursor):
    query = cursor.execute(
        '''
        SELECT
        SoftwareVersion

        FROM SchemaInfo

        WHERE Kind=='Result'
        ''',
    )
    return tuple([int(i) for i in next(query)[0].split('.')])


def _update_label_names(cursor, pd_version):
    if pd_version[:2] in PD1_SUPPORTED_VERSIONS:
        fields = cursor.execute(
            '''
            SELECT
            AminoAcidModifications.Abbreviation,
            AminoAcidModifications.ModificationName,
            AminoAcids.AminoAcidName,
            AminoAcids.OneLetterCode

            FROM AminoAcidModifications

            JOIN AminoAcidModificationsAminoAcids
            ON AminoAcidModifications.AminoAcidModificationID=
            AminoAcidModificationsAminoAcids.AminoAcidModificationID

            JOIN AminoAcids
            ON AminoAcids.AminoAcidID=
            AminoAcidModificationsAminoAcids.AminoAcidID

            WHERE AminoAcidModifications.isActive
            ''',
        )
    elif pd_version[:2] in PD2_SUPPORTED_VERSIONS:
        fields = cursor.execute(
            '''
            SELECT
            FoundModifications.Abbreviation,
            FoundModifications.Name,
            AminoAcids.Name,
            AminoAcids.OneLetterCode

            FROM FoundModifications

            JOIN FoundModificationsAminoAcids
            ON FoundModifications.ModificationID=
            FoundModificationsAminoAcids.FoundModificationsModificationID

            JOIN AminoAcids
            ON AminoAcids.AminoAcidID=
            FoundModificationsAminoAcids.AminoAcidsAminoAcidID
            ''',
        )
    else:
        raise Exception(
            'Unsupported Proteome Discoverer Version: {}'.format(pd_version)
        )

    for abbrev, mod_name, aa_name, letter in fields:
        if any(
            i in abbrev or i in mod_name
            for i in data_sets.modification.LABEL_NAME_TARGETS
        ):
            if aa_name in ['N-Terminus']:
                letter = 'N-term'
            elif aa_name in ['C-Terminus']:
                letter = 'C-term'

            data_sets.modification.LABEL_NAMES[abbrev].add(letter)


def _get_quant_tags(cursor, pd_version):
    if pd_version[:2] in PD1_SUPPORTED_VERSIONS:
        quantification = cursor.execute(
            '''
            SELECT
            ParameterValue

            FROM ProcessingNodeParameters

            WHERE ProcessingNodeParameters.ParameterName='QuantificationMethod'
            ''',
        ).fetchone()

        if quantification:
            quantification = quantification[0]

            if sys.version_info.major < 3:
                quantification = quantification.encode('utf-8')

            root = ET.fromstring(quantification)
            quant_tags = root.findall(
                'MethodPart/MethodPart/Parameter[@name=\'TagName\']',
            )
            tag_names = [i.text for i in quant_tags]
        else:
            tag_names = None
    elif pd_version[:2] in PD2_SUPPORTED_VERSIONS:
        quantification = cursor.execute(
            '''
            SELECT
            AnalysisDefinitionXML

            FROM AnalysisDefinition
            ''',
        ).fetchone()

        if quantification:
            quantification = quantification[0]

            if sys.version_info.major < 3:
                quantification = quantification.encode('utf-8')

            root = ET.fromstring(quantification)
            quant_tags = root.findall(
                'StudyDefinition/QuanMethods/'
                'QuanMethod/QuanChannels/QuanChannel',
            )
            tag_names = [i.get('Name') for i in quant_tags]
        else:
            tag_names = None
    else:
        raise Exception(
            'Unsupported Proteome Discoverer Version: {}'.format(pd_version)
        )

    return tag_names


def _read_peptides(conn, pd_version, pick_best_psm=False):
    if pd_version[:2] in PD1_SUPPORTED_VERSIONS:
        sql_query = '''
        SELECT
        Peptides.PeptideID,
        Peptides.SpectrumID,
        Peptides.Sequence,
        Peptides.SearchEngineRank AS 'Rank',
        Peptides.ConfidenceLevel AS 'Confidence Level',
        Peptides.MissedCleavages AS 'Missed Cleavages',
        PeptideScores.ScoreValue AS 'Ion Score',
        SpectrumHeaders.FirstScan AS 'Scan',
        SpectrumHeaders.LastScan AS 'Last Scan',
        FileInfos.FileName AS 'Spectrum File',
        MassPeaks.PercentIsolationInterference AS 'Isolation Interference'

        FROM Peptides

        JOIN PeptideScores
        ON Peptides.PeptideID=PeptideScores.PeptideID

        JOIN SpectrumHeaders
        ON Peptides.SpectrumID=SpectrumHeaders.SpectrumID

        JOIN FileInfos
        ON FileInfos.FileID=MassPeaks.FileID

        JOIN Masspeaks
        ON Masspeaks.MassPeakID=SpectrumHeaders.MassPeakID
        ''' + (
            '''
            WHERE Peptides.SearchEngineRank=1
            '''
            if pick_best_psm else
            ''
        )
    elif pd_version[:2] in PD2_SUPPORTED_VERSIONS:
        columns = [i[1] for i in conn.cursor().execute('PRAGMA table_info(TargetPsms)')]
        sql_query = (
            '''
            SELECT
            TargetPsms.PeptideID,
            TargetPsmsMSnSpectrumInfo.MSnSpectrumInfoSpectrumID,
            TargetPsms.Sequence,
            TargetPsms.SearchEngineRank AS 'Rank',
            TargetPsms.MissedCleavages AS 'Missed Cleavages',
            {0}
            TargetPsms.FirstScan AS 'Scan',
            TargetPsms.LastScan AS 'Last Scan',
            WorkflowInputFiles.FileName AS 'Spectrum File',
            TargetPsms.PercentIsolationInterference AS 'Isolation Interference'

            FROM TargetPsms

            JOIN TargetPsmsMSnSpectrumInfo
            ON TargetPsmsMSnSpectrumInfo.TargetPsmsPeptideID=TargetPsms.PeptideID

            JOIN WorkflowInputFiles
            ON WorkflowInputFiles.FileID=TargetPsms.SpectrumFileID
            ''' + (
                '''
                WHERE TargetPsms.SearchEngineRank=1
                '''
                if pick_best_psm else
                ''
            )
        ).format(
            "TargetPsms.IonsScore AS 'Ion Score'," if 'IonsScore' in columns else ''
        )
    else:
        raise Exception(
            'Unsupported Proteome Discoverer Version: {}'.format(pd_version)
        )

    df = pd.read_sql_query(
        sql=sql_query,
        con=conn,
        index_col='PeptideID',
    )

    if 'Confidence Level' not in df.columns:
        # XXX: Hacked on!
        df['Confidence Level'] = 3

    if 'Ion Score' not in df.columns:
        df['Ion Score'] = np.nan

    return df


def _extract_sequence(df):
    if df.shape[0] < 1:
        return df

    df['Sequence'] = df.apply(
        lambda row:
        data_sets.extract_sequence(
            row['Proteins'],
            row['Sequence'],
        ),
        axis=1,
    )

    return df


def _extract_confidence(df):
    df['Confidence Level'] = df['Confidence Level'].apply(
        lambda x: CONFIDENCE_MAPPING[x]
    )

    return df


def _extract_spectrum_file(df):
    # 'path/to/file.ext' => 'file.ext'
    df['Spectrum File'] = df['Spectrum File'].apply(
        lambda x: os.path.split(x)[1]
    )

    return df


def _get_proteins(df, cursor, pd_version):
    if pd_version[:2] in PD1_SUPPORTED_VERSIONS:
        prots = cursor.execute(
            '''
            SELECT
            Peptides.PeptideID,
            ProteinAnnotations.Description,
            Proteins.Sequence

            FROM Peptides

            JOIN PeptidesProteins
            ON Peptides.PeptideID=PeptidesProteins.PeptideID

            JOIN ProteinAnnotations
            ON ProteinAnnotations.ProteinID=PeptidesProteins.ProteinID

            JOIN Proteins
            ON Proteins.ProteinID=PeptidesProteins.ProteinID
            ''',
        )
    elif pd_version[:2] in PD2_SUPPORTED_VERSIONS:
        prots = cursor.execute(
            '''
            SELECT
            TargetPsms.PeptideID,
            TargetProteins.FastaTitleLines,
            TargetProteins.Sequence

            FROM TargetPsms

            JOIN TargetProteinsTargetPsms
            ON TargetPsms.PeptideID=
            TargetProteinsTargetPsms.TargetPsmsPeptideID

            JOIN TargetProteins
            ON TargetProteins.UniqueSequenceID=
            TargetProteinsTargetPsms.TargetProteinsUniqueSequenceID
            ''',
        )
    else:
        raise Exception(
            'Unsupported Proteome Discoverer Version: {}'.format(pd_version)
        )

    accessions = defaultdict(list)
    genes = defaultdict(list)
    descriptions = defaultdict(list)
    sequences = defaultdict(list)

    for peptide_id, prot_string, seq in prots:
        if prot_string.startswith('>'):
            prot_string = prot_string[1:]
        
        for fasta_line in prot_string.split('\n'):
            if fasta_line.startswith('>'):
                fasta_line = fasta_line[1:]
            
            try:
                matches = pypuniprot.RE_DISCOVERER_ACCESSION.match(fasta_line)
                acc = matches.group(13) or matches.group(5) or matches.group(14)
                accessions[peptide_id].append(acc)
                if not acc:
                    print(acc)
                    print(fasta_line, accessions[peptide_id][-1], matches.groups())
            except:
                print(fasta_line)
                raise

            try:
                gene = RE_GENE.match(prot_string)

                if gene:
                    gene = gene.group(1)
                else:
                    matches = RE_GENE_BACKUP.match(prot_string)
                    gene = matches.group(12)
                    # print(prot_string, gene, matches.groups())
            except:
                print(prot_string)
                raise

            genes[peptide_id].append(
                gene
            )
            try:
                matches = RE_DESCRIPTION.match(prot_string)
                desc = RE_DESCRIPTION.match(prot_string)
                desc = desc.group(12)
            except:
                print(prot_string)
                raise

            descriptions[peptide_id].append(desc)
            sequences[peptide_id].append(seq)

    df['Protein Descriptions'] = df.index.map(
        lambda peptide_id:
        '; '.join(descriptions[peptide_id])
    )

    df['Protein Group Accessions'] = df.index.map(
        lambda peptide_id:
        '; '.join(accessions[peptide_id])
    )

    df['Proteins'] = df.index.map(
        lambda peptide_id:
        data_sets.Proteins(
            proteins=tuple(
                data_sets.Protein(
                    accession=accession,
                    gene=gene,
                    full_sequence=seq,
                    description=desc,
                )
                for accession, gene, seq, desc in zip(
                    accessions[peptide_id],
                    genes[peptide_id],
                    sequences[peptide_id],
                    descriptions[peptide_id],
                )
            )
        )
    )

    return df


def _get_modifications(df, cursor, pd_version):
    mod_dict = defaultdict(list)

    if pd_version[:2] in PD1_SUPPORTED_VERSIONS:
        aa_mods = cursor.execute(
            '''
            SELECT
            Peptides.PeptideID,
            AminoAcidModifications.Abbreviation,
            PeptidesAminoAcidModifications.Position

            FROM Peptides

            JOIN PeptidesAminoAcidModifications
            ON Peptides.PeptideID=PeptidesAminoAcidModifications.PeptideID

            JOIN AminoAcidModifications
            ON PeptidesAminoAcidModifications.AminoAcidModificationID=
            AminoAcidModifications.AminoAcidModificationID
            ''',
        )

        for peptide_id, name, pos in aa_mods:
            if peptide_id not in df.index:
                continue

            mod = data_sets.Modification(
                rel_pos=pos,
                mod_type=name,
                nterm=False,
                cterm=False,
            )

            mod_dict[peptide_id].append(mod)

        term_mods = cursor.execute(
            '''
            SELECT
            Peptides.PeptideID,
            Peptides.Sequence,
            AminoAcidModifications.Abbreviation,
            AminoAcidModifications.PositionType

            FROM Peptides

            JOIN PeptidesTerminalModifications
            ON Peptides.PeptideID=PeptidesTerminalModifications.PeptideID

            JOIN AminoAcidModifications
            ON PeptidesTerminalModifications.TerminalModificationID=
            AminoAcidModifications.AminoAcidModificationID
            ''',
        )

        # PositionType rules taken from:
        #
        # https://github.com/compomics/thermo-msf-parser/blob/
        # 697a2fe94de2e960a9bb962d1f263dc983461999/thermo_msf_parser_API/
        # src/main/java/com/compomics/thermo_msf_parser_API/highmeminstance/
        # Parser.java#L1022
        for peptide_id, pep_seq, name, pos_type in term_mods:
            if peptide_id not in df.index:
                continue

            nterm = pos_type == 1
            pos = 0 if nterm else len(pep_seq)

            mod = data_sets.Modification(
                rel_pos=pos,
                mod_type=name,
                nterm=nterm,
                cterm=not nterm,
            )
            mod_dict[peptide_id].append(mod)
    elif pd_version[:2] in PD2_SUPPORTED_VERSIONS:
        aa_mods = cursor.execute(
            '''
            SELECT
            TargetPsms.PeptideID,
            FoundModifications.Abbreviation,
            TargetPsmsFoundModifications.Position

            FROM TargetPsms

            JOIN TargetPsmsFoundModifications
            ON
            TargetPsmsFoundModifications.TargetPsmsPeptideID=TargetPsms.PeptideID

            JOIN FoundModifications
            ON
            TargetPsmsFoundModifications.FoundModificationsModificationID=
            FoundModifications.ModificationID

            WHERE
            FoundModifications.PositionType NOT IN (1, 2)
            ''',
        )

        for peptide_id, name, pos in aa_mods:
            if peptide_id not in df.index:
                continue

            pos -= 1

            mod = data_sets.Modification(
                rel_pos=pos,
                mod_type=name,
                nterm=False,
                cterm=False,
            )

            mod_dict[peptide_id].append(mod)

        term_mods = cursor.execute(
            '''
            SELECT
            TargetPsms.PeptideID,
            TargetPsms.Sequence,
            FoundModifications.Abbreviation,
            FoundModifications.PositionType

            FROM TargetPsms

            JOIN TargetPsmsFoundModifications
            ON
            TargetPsmsFoundModifications.TargetPsmsPeptideID=TargetPsms.PeptideID

            JOIN FoundModifications
            ON
            TargetPsmsFoundModifications.FoundModificationsModificationID=
            FoundModifications.ModificationID

            WHERE
            FoundModifications.PositionType IN (1, 2)
            ''',
        )

        # PositionType rules taken from:
        #
        # https://github.com/compomics/thermo-msf-parser/blob/
        # 697a2fe94de2e960a9bb962d1f263dc983461999/thermo_msf_parser_API/
        # src/main/java/com/compomics/thermo_msf_parser_API/highmeminstance/
        # Parser.java#L1022
        for peptide_id, pep_seq, name, pos_type in term_mods:
            if peptide_id not in df.index:
                continue

            nterm = pos_type == 1
            pos = 0 if nterm else len(pep_seq)

            mod = data_sets.Modification(
                rel_pos=pos,
                mod_type=name,
                nterm=nterm,
                cterm=not nterm,
            )
            mod_dict[peptide_id].append(mod)
    else:
        raise Exception(
            'Unsupported Proteome Discoverer Version: {}'.format(pd_version)
        )

    mod_dict = {
        key: _sort_mods(val)
        for key, val in mod_dict.items()
    }

    def _get_mods(row):
        peptide_id = row.name

        mods = data_sets.Modifications(
            mods=mod_dict.get(peptide_id, tuple()),
        )

        for mod in mods.mods:
            assert mod.sequence is None
            mod.sequence = row['Sequence']

        row['Sequence'].modifications = mods

        return mods

    df['Modifications'] = df.apply(_get_mods, axis=1)

    return df

def _get_unlabeled_quantifications(df, cursor, pd_version, tag_names):
    if pd_version[:2] in PD1_SUPPORTED_VERSIONS:
        return df
    elif pd_version[:2] in PD2_SUPPORTED_VERSIONS:
        try:
            vals = cursor.execute(
                '''
                SELECT
                TargetPsms.PeptideID,
                LcmsFeatures.FileDisplayId,
                LcmsFeatures.Area

                FROM LcmsFeatures

                INNER JOIN LcmsFeaturesTargetPsms
                ON LcmsFeaturesTargetPsms.LcmsFeaturesId=LcmsFeatures.Id

                INNER JOIN TargetPsms
                ON TargetPsms.PeptideID=
                LcmsFeaturesTargetPsms.TargetPsmsPeptideID
                ''',
            )
        except Exception as err:
            print(err)
            LOGGER.warn('No unlabeled quantification available')
            return df
    else:
        raise Exception(
            'Unsupported Proteome Discoverer Version: {}'.format(pd_version)
        )

    
    mapping = {
        (peptide_id, channel_id): height
        for peptide_id, channel_id, height in vals
        if peptide_id in df.index
    }

    channel_ids = sorted(set(i[1] for i in mapping.keys()))
    col_names = channel_ids

    def _get_quants(row):
        peptide_id = row.name

        vals = [
            mapping.get((peptide_id, channel_id), np.nan)
            for channel_id in channel_ids
        ]

        # Convert very low ion counts to nan
        vals = [
            np.nan if val <= 1 else val
            for val in vals
        ]

        return pd.Series(vals, index=col_names)

    df[col_names] = df.apply(
        _get_quants,
        axis=1,
    )
    display(df)

    return df


def _get_quantifications(df, cursor, pd_version, tag_names):
    if not tag_names:
        return _get_unlabeled_quantifications(df, cursor, pd_version, tag_names)

    # XXX: Bug: Peak heights do not exactly match those from Discoverer

    if pd_version[:2] in PD1_SUPPORTED_VERSIONS:
        vals = cursor.execute(
            '''
            SELECT
            Peptides.PeptideID,
            ReporterIonQuanResults.QuanChannelID,
            ReporterIonQuanResults.Height

            FROM Peptides

            JOIN ReporterIonQuanResults
            ON Peptides.SpectrumID=
            ReporterIonQuanResultsSearchSpectra.SearchSpectrumID

            JOIN ReporterIonQuanResultsSearchSpectra
            ON ReporterIonQuanResultsSearchSpectra.SpectrumID=
            ReporterIonQuanResults.SpectrumID
            ''',
        )
    elif pd_version[:2] in PD2_SUPPORTED_VERSIONS:
        vals = cursor.execute(
            '''
            SELECT
            TargetPsms.PeptideID,
            ReporterQuanResults.QuanChannelID,
            ReporterQuanResults.Height

            FROM ReporterQuanResults

            INNER JOIN QuanSpectrumInfoTargetPsms
            ON QuanSpectrumInfoTargetPsms.QuanSpectrumInfoSpectrumID=
            ReporterQuanResults.QuanResultID

            INNER JOIN QuanSpectrumInfo
            ON QuanSpectrumInfo.SpectrumID=ReporterQuanResults.QuanResultID

            INNER JOIN TargetPsms
            ON TargetPsms.PeptideID=
            QuanSpectrumInfoTargetPsms.TargetPsmsPeptideID
            ''',
        )
    else:
        raise Exception(
            'Unsupported Proteome Discoverer Version: {}'.format(pd_version)
        )

    mapping = {
        (peptide_id, channel_id): height
        for peptide_id, channel_id, height in vals
        if peptide_id in df.index
    }

    channel_ids = sorted(set(i[1] for i in mapping.keys()))

    col_names = [tag_names[channel_id - 1] for channel_id in channel_ids]

    def _get_quants(row):
        peptide_id = row.name

        vals = [
            mapping.get((peptide_id, channel_id), np.nan)
            for channel_id in channel_ids
        ]

        # Convert very low ion counts and unlabeled peptides to nan
        vals = [
            np.nan if val <= 1 else val
            for val in vals
        ]

        if not row['Sequence'].is_labeled:
            vals = [np.nan] * len(vals)

        return pd.Series(vals, index=col_names)

    df[col_names] = df.apply(
        _get_quants,
        axis=1,
    )

    return df


def _get_ms_data(df, cursor, pd_version):
    if pd_version[:2] in PD1_SUPPORTED_VERSIONS:
        vals = cursor.execute(
            '''
            SELECT
            Peptides.PeptideID,
            SpectrumHeaders.Charge,
            SpectrumHeaders.Mass,
            SpectrumHeaders.RetentionTime,
            MassPeaks.Intensity

            FROM Peptides

            JOIN SpectrumHeaders
            ON Peptides.SpectrumID=SpectrumHeaders.SpectrumID

            JOIN MassPeaks
            ON MassPeaks.MassPeakID=SpectrumHeaders.MassPeakID
            ''',
        )
    elif pd_version[:2] in PD2_SUPPORTED_VERSIONS:
        vals = cursor.execute(
            '''
            SELECT
            TargetPsms.PeptideID,
            TargetPsms.Charge,
            TargetPsms.Mass,
            TargetPsms.RetentionTime,
            TargetPsms.Intensity

            FROM TargetPsms
            ''',
        )
    else:
        raise Exception(
            'Unsupported Proteome Discoverer Version: {}'.format(pd_version)
        )

    mapping = {
        index: (charge, mass, rt, i)
        for index, charge, mass, rt, i in vals
    }
    charge_mapping = {
        key: {val[0]}
        for key, val in mapping.items()
    }
    mass_mapping = {
        key: {val[1]}
        for key, val in mapping.items()
    }
    rt_mapping = {
        key: {val[2]}
        for key, val in mapping.items()
    }
    i_mapping = {
        key: {val[3]}
        for key, val in mapping.items()
    }

    df['Charges'] = df.index.map(
        lambda peptide_id:
        charge_mapping.get(peptide_id, set()),
    )
    df['Masses'] = df.index.map(
        lambda peptide_id:
        mass_mapping.get(peptide_id, set()),
    )
    df['RTs'] = df.index.map(
        lambda peptide_id:
        rt_mapping.get(peptide_id, set()),
    )
    df['Intensities'] = df.index.map(
        lambda peptide_id:
        i_mapping.get(peptide_id, set()),
    )

    return df


def _set_defaults(df):
    df['Validated'] = False
    df['Fold Change'] = np.nan
    df['p-value'] = np.nan

    return df


def _get_filenames(df, cursor, pd_version):
    if pd_version[:2] in PD1_SUPPORTED_VERSIONS:
        files = cursor.execute(
            '''
            SELECT
            Peptides.PeptideID,
            FileInfos.PhysicalFileName

            FROM Peptides

            JOIN SpectrumHeaders
            ON Peptides.SpectrumID=SpectrumHeaders.SpectrumID

            JOIN MassPeaks
            ON MassPeaks.MassPeakID=SpectrumHeaders.MassPeakID

            JOIN FileInfos
            ON FileInfos.FileID=MassPeaks.FileID
            ''',
        )
    elif pd_version[:2] in PD2_SUPPORTED_VERSIONS:
        files = cursor.execute(
            '''
            SELECT
            TargetPsms.PeptideID,
            WorkflowInputFiles.PhysicalFileName

            FROM TargetPsms

            JOIN WorkflowInputFiles
            ON WorkflowInputFiles.FileID=TargetPsms.SpectrumFileID
            ''',
        )
    else:
        raise Exception(
            'Unsupported Proteome Discoverer Version: {}'.format(pd_version)
        )

    mapping = {
        index: os.path.split(val)[-1]
        for index, val in files
    }

    df['Raw Paths'] = df.index.map(
        lambda peptide_id:
        mapping.get(peptide_id, {})
    )

    return df


def _get_q_values(df, cursor, pd_version):
    df['q-value'] = np.nan
    q_vals = []

    if pd_version[:2] in PD1_SUPPORTED_VERSIONS:
        fields = cursor.execute(
            '''
            SELECT
            CustomDataFields.FieldID,
            CustomDataFields.DisplayName

            FROM CustomDataFields
            ''',
        )

        field_ids = [
            field_id
            for field_id, name in fields
            if name in ['q-Value']
        ]

        if not field_ids:
            return df

        q_vals = cursor.execute(
            '''
            SELECT
            CustomDataPeptides.PeptideID,
            CustomDataPeptides.FieldValue

            FROM CustomDataPeptides

            WHERE CustomDataPeptides.FieldID IN ({})
            '''.format(
                ', '.join('?' * len(field_ids))
            ),
            field_ids,
        )
    elif pd_version[:2] in PD2_SUPPORTED_VERSIONS:
        try:
            q_vals = cursor.execute(
                '''
                SELECT
                TargetPsms.PeptideID,
                TargetPsms.PercolatorqValue

                FROM TargetPsms
                ''',
            )
        except sqlite3.OperationalError:
            pass
    else:
        raise Exception(
            'Unsupported Proteome Discoverer Version: {}'.format(pd_version)
        )

    if q_vals:
        indices, vals = zip(*[
            (index, val)
            for index, val in q_vals
            if index in df.index
        ])

        df.at[indices, 'q-value'] = vals

    return df


def _is_pmod(mod):
    return (
        not mod.nterm and
        not mod.cterm and
        mod.mod_type in ['Phospho']
    )


def _sort_mods(mods):
    return tuple(
        sorted(
            mods,
            key=lambda x: (x.rel_pos, x.nterm, x.cterm, x.mod_type),
        )
    )


def _reassign_mods(mods, psp_val, probability_cutoff=75):
    reassigned = False
    ambiguous = False

    # phophoRS example format: 'T(4): 99.6; S(6): 0.4; S(10): 0.0'
    # Error messages include: 'Too many isoforms'
    if psp_val is None:
        psp_val = ''

    psp_val = [
        RE_PSP.match(i.strip())
        for i in psp_val.split(';')
    ]
    psp_val = [
        i.groups()
        for i in psp_val
        if i
    ]
    psp_val = [
        (i[0], int(i[1]), float(i[2]))
        for i in psp_val
    ]

    o_mods = [i for i in mods if not _is_pmod(i)]
    p_mods = [i for i in mods if _is_pmod(i)]
    psp_val_f = [i for i in psp_val if i[2] > probability_cutoff]

    if len(p_mods) != len(psp_val_f):
        LOGGER.debug(
            'Not enough info to assign phophosite: {}'.format(psp_val)
        )
        ambiguous = True
    elif set(i.rel_pos + 1 for i in p_mods) != set(i[1] for i in psp_val_f):
        p_mods = [
            data_sets.Modification(
                rel_pos=i[1] - 1,
                mod_type='Phospho',
                nterm=False,
                cterm=False,
                sequence=p_mods[0].sequence,
            )
            for i in psp_val_f
        ]
        reassigned = True

        mods = data_sets.Modifications(
            mods=_sort_mods(o_mods + p_mods),
        )

        for mod in mods.mods:
            mod.sequence.modifications = mods

    return mods, reassigned, ambiguous


def _get_phosphors(df, cursor, pd_version, name=None):
    df['Ambiguous'] = False
    psp_vals = []

    if pd_version[:2] in PD1_SUPPORTED_VERSIONS:
        fields = cursor.execute(
            '''
            SELECT
            CustomDataFields.FieldID,
            CustomDataFields.DisplayName

            FROM CustomDataFields
            ''',
        )
        field_ids = [
            field_id
            for field_id, name in fields
            if name in ['phosphoRS Site Probabilities']
        ]

        if not field_ids:
            return df

        psp_vals = cursor.execute(
            '''
            SELECT
            CustomDataPeptides.PeptideID,
            CustomDataPeptides.FieldValue

            FROM CustomDataPeptides

            WHERE CustomDataPeptides.FieldID IN ({})
            '''.format(
                ', '.join('?' * len(field_ids))
            ),
            field_ids,
        )
    elif pd_version[:2] in PD2_SUPPORTED_VERSIONS:
        for ptmrs_col in [
            'ptmRSPhosphoSiteProbabilities',
            'ptmRSPhosphorylationSiteProbabilities',
        ]:
            try:
                psp_vals = cursor.execute(
                    '''
                    SELECT
                    TargetPsms.PeptideID,
                    TargetPsms.{}

                    FROM TargetPsms
                    '''.format(ptmrs_col),
                )
            except sqlite3.OperationalError:
                pass
            else:
                break
    else:
        raise Exception(
            'Unsupported Proteome Discoverer Version: {}'.format(pd_version)
        )

    changed_peptides = 0

    for pep_id, psp_val in psp_vals:
        if pep_id not in df.index:
            continue

        old_mods = df.loc[pep_id]['Modifications']

        new_mods, reassigned, ambiguous = _reassign_mods(old_mods, psp_val)

        if reassigned:
            changed_peptides += 1

        df.at[pep_id, 'Modifications'] = new_mods
        df.at[pep_id, 'Ambiguous'] = ambiguous

    LOGGER.info(
        '{}: -- Reassigned {} phosphosites using phosphoRS'.format(
            name,
            changed_peptides,
        )
    )

    return df


def _get_species(cursor, pd_version):
    if pd_version[:2] in PD1_SUPPORTED_VERSIONS:
        fields = cursor.execute(
            '''
            SELECT
            ProcessingNodeParameters.ParameterValue

            FROM ProcessingNodeParameters

            WHERE ProcessingNodeParameters.ParameterName='Taxonomy'
            ''',
        )
    elif pd_version[:2] in PD2_SUPPORTED_VERSIONS:
        fields = cursor.execute(
            '''
            SELECT
            ProcessingNodeCustomData.CustomValue

            FROM ProcessingNodeCustomData

            WHERE ProcessingNodeCustomData.Name='FASTA database information'
            ''',
        )
        fields = [
            (line[len('Taxonomy:'):].strip(),)
            for i in fields
            for line in i[0].split('\n')
            if line.startswith('Taxonomy:')
        ]
    else:
        raise Exception(
            'Unsupported Proteome Discoverer Version: {}'.format(pd_version)
        )

    species = set()

    for name, in fields:
        # '. . . . . . . . . . . . . . . . Homo sapiens (human)'
        # => 'Homo sapiens'
        species.add(' '.join(name.strip('. ').split(' ')[:2]))

    return species


[docs]def read_discoverer_msf(basename, msf_path=None, pick_best_psm=False):
    '''
    Read a Proteome Discoverer .msf file.

    Converts file contents into a pandas DataFrame similar to what one would
    get by exporting peptides to .txt directly from Discoverer.

    Parameters
    ----------
    path : str
    pick_best_psm : bool, optional

    Returns
    -------
    df : :class:`pandas.DataFrame`
    '''
    if msf_path is None:
        msf_path = os.path.join(
            paths.MS_SEARCHED_DIR,
            basename,
        )

    if not os.path.exists(msf_path):
        raise Exception('Search database does not exist: {}'.format(msf_path))

    name = os.path.splitext(basename)[0]

    LOGGER.info(
        '{}: Loading ProteomeDiscoverer peptides...'.format(name)
    )

    with sqlite3.connect(msf_path) as conn:
        cursor = conn.cursor()

        pd_version = _get_pd_version(cursor)

        _update_label_names(cursor, pd_version)

        # Get any N-terminal quantification tags
        tag_names = _get_quant_tags(cursor, pd_version)

        # Read the main peptide properties
        df = _read_peptides(conn, pd_version, pick_best_psm=pick_best_psm)

        LOGGER.info(
            '{}: -- Loading information for {} peptides'.format(
                name,
                df.shape[0],
            )
        )

        df = _get_proteins(df, cursor, pd_version)
        df = _extract_sequence(df)
        df = _extract_confidence(df)
        df = _extract_spectrum_file(df)
        df = _get_modifications(df, cursor, pd_version)
        df = _get_phosphors(df, cursor, pd_version, name=name)
        df = _get_q_values(df, cursor, pd_version)
        df = _get_ms_data(df, cursor, pd_version)
        df = _get_filenames(df, cursor, pd_version)
        df = _get_quantifications(df, cursor, pd_version, tag_names)

        df = _set_defaults(df)

        species = _get_species(cursor, pd_version)

    df['Scan Paths'] = basename

    df.reset_index(inplace=True, drop=True)

    LOGGER.info(
        '{}: Loaded {} peptides'
        .format(
            os.path.splitext(basename)[0],
            df.shape[0],
        )
    )

    return df, species

__all__ = [
    'read_discoverer_msf',
]