"""
Parse the OBO flat-file
"""
import os
from typing import Callable, Dict, Iterator, List
from pyhpo.config import TRUTH
FILENAME = "hp.obo"
class Metadata:
format_version: str
data_version: str
header: List[str] = []
@classmethod
def add_header_row(cls, row: str) -> None:
cls.header.append(row)
class Converter:
key_conversion: Dict[str, str] = {"def": "definition"}
type_conversions: Dict[str, Callable] = {}
@classmethod
def add_type_conversion(cls, key: str, func: Callable) -> None:
cls.type_conversions[key] = func
@staticmethod
def array_to_str(value: List[str], key: str, values: List[List[str]]) -> str:
if len(value):
return value[0]
return ""
@staticmethod
def array_to_bool(value: List[str], key: str, values: List[List[str]]) -> bool:
if not len(value):
return False
return value[0].lower() in TRUTH
@staticmethod
def parse_synonym(value: List[str], key: str, values: List[List[str]]) -> List[str]:
"""
Extracts the synonym from the synonym data line in the obo file format
Parameters
----------
synonym: str
value part of synonym-data line of obo file
e.g: "Multicystic dysplastic kidney" EXACT []
Returns
-------
str
Actual synonym title
e.g.: Multicystic dysplastic kidney
"""
return [x.split('"')[1] for x in value]
Converter.add_type_conversion("id", Converter.array_to_str)
Converter.add_type_conversion("name", Converter.array_to_str)
Converter.add_type_conversion("comment", Converter.array_to_str)
Converter.add_type_conversion("definition", Converter.array_to_str)
Converter.add_type_conversion("is_obsolete", Converter.array_to_bool)
Converter.add_type_conversion("replaced_by", Converter.array_to_str)
Converter.add_type_conversion("synonym", Converter.parse_synonym)
[docs]
def terms_from_file(data_folder: str) -> Iterator[dict]:
"""
Reads an obo file line by line to yield
a dict for building an HPOTerm
Parameters
----------
data_folder:
Full path to ``obo`` file
"""
filename = os.path.join(data_folder, FILENAME)
with open(filename) as fh:
# everything above the first [Term] is header
# and thus must not be parsed as term
for line in fh:
line = line.strip()
if line == "[Term]":
break
else:
Metadata.add_header_row(line)
term_section: List[str] = []
for line in fh:
line = line.strip()
if line == "[Term]":
yield parse_obo_section(term_section)
term_section = []
elif line == "[Typedef]":
# we're currently not parsing an Typedef section.
# Since they only appear at the end of the OBO file
# we're stopping the parsing here.
# TODO: Instead of break, add logic to skip all Typedef
# sections and continue with term parsing
break
else:
term_section.append(line)
yield parse_obo_section(term_section)
[docs]
def parse_obo_section(term_section: List[str]) -> dict:
"""
Parses the section of an OBO file for one single HPO term
Parameters
----------
term_section:
Lines of the ``obo`` file that describe the HPO term
"""
term_data = {}
for line in term_section:
if line == "":
continue
key, value = line.split(":", 1)
if key not in term_data:
term_data[key] = [value.strip()]
else:
term_data[key].append(value.strip())
term_dict = _convert_dict_keys(term_data)
term_dict = _convert_value_types(term_data)
return term_dict
def _convert_dict_keys(term_data: dict) -> dict:
"""
The HPO obo flat file contains some unfortunate attribute names.
This function will convert them into the actual attributes
for ``HPOTerm``
"""
for old, new in Converter.key_conversion.items():
term_data[new] = term_data.pop(old, [])
return term_data
def _convert_value_types(term_data: dict) -> dict:
for key, convert in Converter.type_conversions.items():
term_data[key] = convert(term_data.get(key, []), key, term_data)
return term_data