# Source code for pyhpo.parser.obo

"""
Parse the OBO flat-file
"""

import os
from typing import Callable, Dict, Iterator, List

from pyhpo.config import TRUTH


FILENAME = "hp.obo"


class Metadata:
    """
    Class-level container for the header part of the OBO file

    Nothing is stored on instances: ``header`` lives on the class itself
    and is therefore shared by everything that uses ``Metadata``.
    """

    # Only declared here, no value is assigned in this class
    # NOTE(review): presumably filled from the header rows elsewhere - confirm
    format_version: str
    data_version: str

    # Header rows in the order in which they were added
    header: List[str] = []

    @classmethod
    def add_header_row(cls, row: str) -> None:
        """
        Appends one row to the shared ``header`` list

        Parameters
        ----------
        row:
            A single line of the header section
        """
        rows = cls.header
        rows.append(row)


class Converter:
    """
    Registry of key renames and value converters for parsed OBO terms

    Every converter is called as ``func(value, key, values)``:

    value:
        All raw string values collected for ``key`` (empty list if the
        key is missing from the term)
    key:
        The key that is being converted
    values:
        The complete term dict that is being converted
    """

    # OBO tag name -> key name used in the term dict
    key_conversion: Dict[str, str] = {"def": "definition"}

    # term dict key -> converter, filled through ``add_type_conversion``
    type_conversions: Dict[str, Callable] = {}

    @classmethod
    def add_type_conversion(cls, key: str, func: Callable) -> None:
        """
        Registers ``func`` as converter for ``key``

        An already registered converter for the same key is replaced.
        """
        cls.type_conversions[key] = func

    @staticmethod
    def array_to_str(value: List[str], key: str, values: dict) -> str:
        """
        Returns the first entry of ``value`` or an empty string
        if ``value`` is empty
        """
        return value[0] if value else ""

    @staticmethod
    def array_to_bool(value: List[str], key: str, values: dict) -> bool:
        """
        Returns ``True`` if the lower-cased first entry of ``value``
        is contained in ``TRUTH``, ``False`` otherwise or if
        ``value`` is empty
        """
        if not value:
            return False
        return value[0].lower() in TRUTH

    @staticmethod
    def _quoted_text(line: str) -> str:
        """
        Returns the text of the first double-quoted string in ``line``

        A backslash-escaped quote or backslash inside the string is
        unescaped and does not end the string. All other backslash
        sequences are kept unchanged. An unterminated string runs to the
        end of the line.

        Raises
        ------
        IndexError
            If ``line`` does not contain a double quote
        """
        # Everything after the opening quote. The ``[1]`` raises
        # IndexError for lines without any quote.
        rest = line.split('"', 1)[1]

        chars: List[str] = []
        escaped = False
        for char in rest:
            if escaped:
                if char not in ('"', "\\"):
                    # not an escape we resolve: keep the backslash
                    chars.append("\\")
                chars.append(char)
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                # closing quote
                break
            else:
                chars.append(char)
        else:
            # no closing quote: keep a dangling trailing backslash
            if escaped:
                chars.append("\\")
        return "".join(chars)

    @staticmethod
    def parse_synonym(value: List[str], key: str, values: dict) -> List[str]:
        """
        Extracts the synonyms from the synonym data lines
        in the obo file format

        Parameters
        ----------
        value:
            Value parts of all synonym-data lines of one term

            e.g: ["Multicystic dysplastic kidney" EXACT []]
        key:
            Unused, part of the converter call signature
        values:
            Unused, part of the converter call signature

        Returns
        -------
        list of str
            Actual synonym titles, one for every entry in ``value``

            e.g.: [Multicystic dysplastic kidney]

        Raises
        ------
        IndexError
            If an entry does not contain a double quote
        """
        return [Converter._quoted_text(x) for x in value]


# Default converters for the term attributes.
# The registration order is kept, because it is the order
# in which ``_convert_value_types`` applies the converters.
for _key, _converter in (
    ("id", Converter.array_to_str),
    ("name", Converter.array_to_str),
    ("comment", Converter.array_to_str),
    ("definition", Converter.array_to_str),
    ("is_obsolete", Converter.array_to_bool),
    ("replaced_by", Converter.array_to_str),
    ("synonym", Converter.parse_synonym),
):
    Converter.add_type_conversion(_key, _converter)
# don't leave the loop variables behind in the module namespace
del _key, _converter



# [docs]
def terms_from_file(data_folder: str) -> Iterator[dict]:
    """
    Reads the obo file line by line to yield
    a dict for building an HPOTerm

    All lines above the first ``[Term]`` line are added to
    ``Metadata`` as header rows. If the file does not contain
    any ``[Term]`` line, nothing is yielded.

    Parameters
    ----------
    data_folder:
        Path to the folder that contains the ``hp.obo`` file

    Yields
    ------
    dict
        The data of one ``[Term]`` section, as returned
        by ``parse_obo_section``
    """
    filename = os.path.join(data_folder, FILENAME)

    # Explicit encoding, so that reading does not depend on the
    # platform's default encoding.
    # NOTE(review): assumes the obo file is UTF-8 encoded - confirm
    with open(filename, encoding="utf-8") as fh:
        # everything above the first [Term] is header
        # and thus must not be parsed as term
        for line in fh:
            line = line.strip()
            if line == "[Term]":
                break
            Metadata.add_header_row(line)
        else:
            # The file ended without a single [Term] line:
            # there is no term to parse
            return

        term_section: List[str] = []
        for line in fh:
            line = line.strip()
            if line == "[Term]":
                # the next term starts, the collected one is complete
                yield parse_obo_section(term_section)
                term_section = []
            elif line == "[Typedef]":
                # we're currently not parsing a Typedef section.
                # Since they only appear at the end of the OBO file
                # we're stopping the parsing here.
                # TODO: Instead of break, add logic to skip all Typedef
                # sections and continue with term parsing
                break
            else:
                term_section.append(line)

        # the last term is not followed by another [Term] line
        yield parse_obo_section(term_section)




# [docs]
def parse_obo_section(term_section: List[str]) -> dict:
    """
    Parses the section of an OBO file for one single HPO term

    Every line is split at the first ``:`` into key and value.
    All values of the same key are collected in a list, before
    the keys are renamed and the values are converted.

    Parameters
    ----------
    term_section:
        Lines of the ``obo`` file that describe the HPO term

    Returns
    -------
    dict
        The term data after key renaming and value conversion

    Raises
    ------
    ValueError
        If a line that is neither empty nor a comment
        does not contain a ``:``
    """
    term_data: Dict[str, List[str]] = {}
    for line in term_section:
        # Empty lines and comment-only lines (starting with ``!``)
        # don't carry a key
        if not line or line.startswith("!"):
            continue
        key, separator, value = line.partition(":")
        if not separator:
            raise ValueError(f"Invalid line in OBO term section: {line!r}")
        term_data.setdefault(key, []).append(value.strip())

    # Both helpers modify ``term_data`` in place and return it
    return _convert_value_types(_convert_dict_keys(term_data))



def _convert_dict_keys(term_data: dict) -> dict:
    """
    Renames the keys of ``term_data`` according to
    ``Converter.key_conversion``

    The dict is modified in place and also returned. If the old key is
    missing, the new key is set to an empty list.
    """
    renames = Converter.key_conversion
    for obo_key in renames:
        # pop first, so the old key is gone before the new one is set
        term_data[renames[obo_key]] = term_data.pop(obo_key, [])
    return term_data


def _convert_value_types(term_data: dict) -> dict:
    """
    Applies every converter registered in ``Converter.type_conversions``
    to ``term_data``

    The dict is modified in place and also returned. A key that is
    missing from ``term_data`` is converted from an empty list, so that
    every registered key is present afterwards.
    """
    converters = Converter.type_conversions
    for field in converters:
        raw_values = term_data.get(field, [])
        term_data[field] = converters[field](raw_values, field, term_data)
    return term_data