Normalization Examples
Examples for normalizing morphological annotations in CoNLL-U files.
Basic Normalization
The following examples assume you have loaded a feature set:
from conllu_tools.io import load_language_data
from conllu_tools.utils import normalize_morphology
feature_set = load_language_data('feats', language='la')
Example 1: VERB with Gerund Features
# Example 1: the input XPOS has an empty slot that ref_features can fill.
xpos, feats = normalize_morphology(
upos='VERB',
xpos='v-s-ga-g-',
feats='Aspect=Perf|Case=Gen|Degree=Pos|Number=Sing|Voice=Act',
feature_set=feature_set,
ref_features='Aspect=Perf|Case=Gen|Degree=Pos|Number=Sing|VerbForm=Ger|Voice=Act'
)
print(xpos)
# Output: 'v-stga-g-' (position 3, 0-indexed, filled in from the features)
print(feats)
# Output: {'Aspect': 'Perf', 'Case': 'Gen', 'Degree': 'Pos', 'Number': 'Sing', 'VerbForm': 'Ger', 'Voice': 'Act'}
Example 2: AUX with Finite Verb Features
# Example 2: a fully specified finite AUX passes validation unchanged.
xpos, feats = normalize_morphology(
upos='AUX',
xpos='v2spia---',
feats='Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|Voice=Act',
feature_set=feature_set
)
print(xpos)
# Output: 'v2spia---' (all positions validated)
print(feats)
# Output: {'Mood': 'Ind', 'Number': 'Sing', 'Person': '2', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}
Example 3: ADJ with Degree Reconciliation
# Example 3: when ref_features conflicts with FEATS, the token's FEATS win.
xpos, feats = normalize_morphology(
upos='ADJ',
xpos='a-s---nbp',
feats='Case=Abl|Degree=Pos|Gender=Neut|Number=Sing',
feature_set=feature_set,
ref_features='Case=Abl|Gender=Masc|Number=Sing' # Gender conflicts - feats wins
)
print(xpos)
# Output: 'a-s---nbp' (validated)
print(feats)
# Output: {'Case': 'Abl', 'Degree': 'Pos', 'Gender': 'Neut', 'Number': 'Sing'}
# Note: Gender=Neut from feats takes precedence over Gender=Masc from ref_features
Example 4: NOUN with XPOS/UPOS Mismatch Correction
# XPOS suggests VERB, but UPOS is NOUN: UPOS wins and the XPOS is rebuilt.
xpos, feats = normalize_morphology(
upos='NOUN',
xpos='v2spma---', # Wrong UPOS character
feats='Mood=Imp|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|Voice=Act',
feature_set=feature_set
)
print(xpos)
# Output: 'n-s------' (corrected to NOUN format, invalid features removed)
print(feats)
# Output: {'Number': 'Sing'} (only Number is valid for NOUN)
Normalization Workflows
Normalizing a CoNLL-U File
Process an entire CoNLL-U file to normalize all morphological annotations:
from conllu_tools.io import load_language_data
from conllu_tools.utils import normalize_morphology, feature_dict_to_string
# Load the Latin feature set once, up front.
feature_set = load_language_data('feats', language='la')

# Read the corpus (assumes a load_conllu helper is available).
sentences = load_conllu('input.conllu')

# Walk every token and rewrite its XPOS/FEATS in place.
for sentence in sentences:
    for token in sentence:
        tid = str(token['id'])
        # Multiword ranges ("1-2") and empty nodes ("1.1") carry no morphology.
        if '-' in tid or '.' in tid:
            continue
        # Only tokens that have both a UPOS and an XPOS can be normalized.
        if token['upos'] == '_' or token['xpos'] == '_':
            continue
        xpos, feats = normalize_morphology(
            upos=token['upos'],
            xpos=token['xpos'],
            feats={} if token['feats'] == '_' else token['feats'],
            feature_set=feature_set
        )
        token['xpos'] = xpos
        token['feats'] = feature_dict_to_string(feats) if feats else '_'
Cleaning Imported Annotations
Normalize annotations from external sources with ref_features:
from conllu_tools.io import load_language_data
from conllu_tools.utils import normalize_morphology, feature_dict_to_string
# Load the Latin feature set shared by all cleaning calls below.
feature_set = load_language_data('feats', language='la')
def clean_annotation(token, feature_set, ref_token=None):
    """Normalize one token's XPOS/FEATS in place, with a safe fallback.

    Args:
        token: CoNLL-U token dict with at least 'upos', 'xpos', 'feats',
            and 'form' keys; mutated in place.
        feature_set: language feature set from load_language_data().
        ref_token: optional aligned token from a reference source whose
            FEATS are passed as ref_features during normalization.

    Returns:
        The same token dict (also mutated in place).
    """
    # Tokens without a UPOS cannot be normalized at all.
    if token['upos'] == '_':
        return token
    try:
        # Use the reference token's features when they are present.
        ref_features = None
        if ref_token and ref_token['feats'] != '_':
            ref_features = ref_token['feats']
        xpos, feats = normalize_morphology(
            upos=token['upos'],
            # With no XPOS, start from an empty 9-slot tag for this UPOS.
            xpos=token['xpos'] if token['xpos'] != '_' else f"{token['upos'][0].lower()}--------",
            feats=token['feats'] if token['feats'] != '_' else {},
            feature_set=feature_set,
            ref_features=ref_features
        )
        token['xpos'] = xpos
        token['feats'] = feature_dict_to_string(feats) if feats else '_'
    except Exception as e:
        # Best-effort: fall back to an empty tag rather than aborting the run.
        print(f"Warning: Could not normalize token {token['form']}: {e}")
        token['xpos'] = f"{token['upos'][0].lower()}--------"
        token['feats'] = '_'
    return token
# Pair each imported sentence with its reference counterpart and clean
# every plain word token in lockstep.
for sentence, ref_sentence in zip(imported_sentences, reference_sentences):
    for token, ref_token in zip(sentence, ref_sentence):
        # Only integer-id tokens carry morphology (no ranges/empty nodes).
        if not isinstance(token['id'], int):
            continue
        clean_annotation(token, feature_set, ref_token)
Validating After Normalization
Always validate after normalizing to ensure correctness:
from conllu_tools.io import load_language_data
from conllu_tools.validation import ConlluValidator
from conllu_tools.utils import normalize_morphology
# Normalization step
feature_set = load_language_data('feats', language='la')
# ... normalization code ...

# Validation step: re-check the written file at validation level 2.
validator = ConlluValidator(lang='la', level=2)
reporter = validator.validate_file('output_normalized.conllu')
if reporter.get_error_count() == 0:
    print("All annotations normalized successfully!")
else:
    print(f"Found {reporter.get_error_count()} errors after normalization")
    for error in reporter.format_errors():
        print(error)
Batch Normalization
Normalize multiple files in batch:
from pathlib import Path
from conllu_tools.io import load_language_data
from conllu_tools.utils import normalize_morphology, feature_dict_to_string
def normalize_directory(input_dir, output_dir):
    """Normalize every ``*.conllu`` file in input_dir into output_dir.

    Args:
        input_dir: directory containing the raw .conllu files.
        output_dir: destination directory; created if missing.
    """
    feature_set = load_language_data('feats', language='la')
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    # parents=True lets callers pass a nested output path that does not exist yet.
    output_path.mkdir(parents=True, exist_ok=True)
    # sorted() gives a deterministic processing order across platforms.
    for conllu_file in sorted(input_path.glob('*.conllu')):
        print(f"Processing {conllu_file.name}...")
        sentences = load_conllu(str(conllu_file))
        # Normalize each sentence
        for sentence in sentences:
            for token in sentence:
                # Only plain word tokens with a UPOS are normalized.
                if isinstance(token['id'], int) and token['upos'] != '_':
                    try:
                        xpos, feats = normalize_morphology(
                            upos=token['upos'],
                            # With no XPOS, start from an empty 9-slot tag.
                            xpos=token['xpos'] if token['xpos'] != '_' else f"{token['upos'][0].lower()}--------",
                            feats=token['feats'] if token['feats'] != '_' else {},
                            feature_set=feature_set
                        )
                        token['xpos'] = xpos
                        token['feats'] = feature_dict_to_string(feats) if feats else '_'
                    except Exception as e:
                        # Best-effort: report the token and keep going.
                        print(f" Error in {token['form']}: {e}")
        # Save normalized file
        output_file = output_path / conllu_file.name
        save_conllu(sentences, str(output_file))
        print(f"Saved to {output_file}")

# Process all files
normalize_directory('raw_annotations/', 'normalized_annotations/')
Error Handling
Handling Normalization Errors
from conllu_tools.utils import normalize_morphology, validate_features, validate_xpos
# Validation errors: both validators require a UPOS.
try:
    result = validate_xpos(None, 'n-s---mn-')
except ValueError as e:
    print(f"Error: {e}")
# Output: Error: UPOS must be provided to validate XPOS

try:
    result = validate_features(None, 'Case=Nom', feature_set)
except ValueError as e:
    print(f"Error: {e}")
# Output: Error: UPOS and feature set must be provided to validate FEATS

# Format detection errors: a missing XPOS needs UPOS plus FEATS to rebuild.
try:
    xpos, feats = normalize_morphology('NOUN', None, {}, feature_set)
except ValueError as e:
    print(f"Error: {e}")
# Output: Error: Both UPOS and FEATS must be provided to format XPOS
Safe Normalization with Fallbacks
from conllu_tools.utils import normalize_morphology, feature_dict_to_string
def safe_normalize_morphology(upos, xpos, feats, feature_set, ref_features=None):
    """Wrap normalize_morphology so that any failure degrades gracefully.

    On an exception, return a bare 9-slot XPOS derived from the UPOS
    initial (all dashes when UPOS is empty) together with an empty dict.
    """
    try:
        return normalize_morphology(upos, xpos, feats, feature_set, ref_features)
    except Exception as exc:
        print(f"Warning: Could not normalize for UPOS '{upos}': {exc}")
        if upos:
            fallback = f"{upos[0].lower()}--------"
        else:
            fallback = '---------'
        return fallback, {}
# Production loop: never let one bad token abort the batch.
for token in sentence:
    if token['upos'] == '_':
        continue
    xpos, feats = safe_normalize_morphology(
        token['upos'],
        token['xpos'],
        token['feats'],
        feature_set
    )
    token['xpos'] = xpos
    token['feats'] = feature_dict_to_string(feats) if feats else '_'
See Also
Utilities User Guide for detailed documentation