Source code for conllu_tools.utils.xpos.format_xpos
"""Convert various XPOS formats to Perseus XPOS format."""
import regex as re
from conllu_tools.constants import (
ITTB_XPOS_MATCHER,
LLCT_XPOS_MATCHER,
PERSEUS_XPOS_MATCHER,
PROIEL_XPOS_MATCHER,
)
from conllu_tools.utils.features import feature_string_to_dict
from conllu_tools.utils.upos import upos_to_perseus
from .ittb_converters import ittb_to_perseus
from .llct_converters import llct_to_perseus
from .proiel_converters import proiel_to_perseus
[docs]
def format_xpos(upos: str, xpos: str | None, feats: dict[str, str] | str | None) -> str:
"""Convert morphology data in various formats to Perseus XPOS.
Arguments:
upos: The Universal Part of Speech tag.
xpos: XPOS string formatted in almost styles (LLCT, ITTB, PROIEL, Perseus, DALME, etc).
feats: A dictionary of features.
Returns:
A Perseus XPOS string.
"""
if upos is None:
msg = 'UPOS must be provided to format XPOS.'
raise ValueError(msg)
if not feats:
feats = {}
if isinstance(feats, str):
feats = feature_string_to_dict(feats)
if xpos is not None:
if re.match(PERSEUS_XPOS_MATCHER, xpos):
return f'{upos_to_perseus(upos)}{xpos[1:]}' # ensure UPOS is correct
if re.match(LLCT_XPOS_MATCHER, xpos):
return llct_to_perseus(upos, xpos, feats)
if re.match(ITTB_XPOS_MATCHER, xpos):
return ittb_to_perseus(upos, xpos)
if re.match(PROIEL_XPOS_MATCHER, xpos):
return proiel_to_perseus(upos, feats)
return f'{upos_to_perseus(upos)}--------' # default fallback