Utilities Examples
Examples for working with feature strings, XPOS tags, and format conversion utilities.
Convert Features to XPOS
Complete Workflow
from conllu_tools.utils import features_to_xpos, feature_string_to_dict, validate_xpos
# Start with features only
feats_str = 'Case=Gen|Gender=Fem|Number=Plur'
feats_dict = feature_string_to_dict(feats_str)

# Generate XPOS from features. The tag has nine positions; the first one is
# still '-' at this point and is filled in by validate_xpos() below.
xpos = features_to_xpos(feats_dict)
print(f"Generated XPOS: {xpos}")  # '--p---fg-'

# Validate for a specific UPOS
validated_xpos = validate_xpos('NOUN', xpos)
print(f"Validated XPOS: {validated_xpos}")  # 'n-p---fg-'
Filling Partial XPOS
from conllu_tools.utils import features_to_xpos
# You have a partial XPOS and want to fill its gaps from features
provided_xpos = 'v--pi----'  # POS, tense and mood known
features = 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|Voice=Act'

# Generate from features
xpos_from_feats = features_to_xpos(features)
print(xpos_from_feats)  # '-3spia---'

# Reconcile position by position: a provided character takes precedence,
# and a '-' gap is filled with the character derived from the features.
# Both tags have the same nine positions, so zip() pairs them up exactly.
final_xpos = [
    given if given != '-' else derived
    for given, derived in zip(provided_xpos, xpos_from_feats)
]
print(''.join(final_xpos))  # 'v3spia---' (combined)
Format XPOS
Processing Mixed Formats
from conllu_tools.utils import format_xpos
# Process tokens from different source treebanks
tokens = [
    {'upos': 'VERB', 'xpos': 'v|v|3|s|p|i|a|-|-|-', 'feats': 'Mood=Ind|...', 'source': 'LLCT'},
    {'upos': 'NOUN', 'xpos': 'gen2|casA', 'feats': 'Case=Nom|...', 'source': 'ITTB'},
    # 'Nb' is a noun tag in PROIEL, so the UPOS is NOUN (matches the 'n…' output below)
    {'upos': 'NOUN', 'xpos': 'Nb', 'feats': 'Case=Acc|...', 'source': 'PROIEL'},
]

# Normalize every token's XPOS and show the before/after for each source
for token in tokens:
    token['xpos_normalized'] = format_xpos(
        token['upos'],
        token['xpos'],
        token['feats'],
    )
    print(f"{token['source']}: {token['xpos']} → {token['xpos_normalized']}")

# Output:
# LLCT: v|v|3|s|p|i|a|-|-|- → v3spia---
# ITTB: gen2|casA → n-s---fn-
# PROIEL: Nb → n-s---na-
See Also
Utils for detailed documentation