From e832efd97435b7dc7acc982f5a5aa4940c40306e Mon Sep 17 00:00:00 2001 From: karamouche Date: Wed, 27 May 2026 09:16:00 -0400 Subject: [PATCH 1/3] feat: add Danish language support with number normalization and operators --- README.md | 6 +- docs/contributing-guide.md | 2 + normalization/languages/__init__.py | 3 +- normalization/languages/danish/__init__.py | 7 + .../languages/danish/number_normalizer.py | 435 ++++++++++++++++++ normalization/languages/danish/operators.py | 106 +++++ .../languages/danish/replacements.py | 6 + tests/e2e/files/gladia-3/da.csv | 14 + .../danish_number_normalizer_test.py | 113 +++++ tests/unit/languages/danish_operators_test.py | 27 ++ 10 files changed, 716 insertions(+), 3 deletions(-) create mode 100644 normalization/languages/danish/__init__.py create mode 100644 normalization/languages/danish/number_normalizer.py create mode 100644 normalization/languages/danish/operators.py create mode 100644 normalization/languages/danish/replacements.py create mode 100644 tests/e2e/files/gladia-3/da.csv create mode 100644 tests/unit/languages/danish_number_normalizer_test.py create mode 100644 tests/unit/languages/danish_operators_test.py diff --git a/README.md b/README.md index 724e1fd..c8874bf 100644 --- a/README.md +++ b/README.md @@ -112,11 +112,13 @@ Pipelines are defined declaratively in **YAML presets**. Each preset lists the s | Code | Language | | ---- | -------- | +| `da` | Danish | +| `de` | German | | `en` | English | +| `es` | Spanish | +| `fi` | Finnish | | `fr` | French | -| `de` | German | | `it` | Italian | -| `es` | Spanish | | `nl` | Dutch | | `sv` | Swedish | diff --git a/docs/contributing-guide.md b/docs/contributing-guide.md index e271b87..e57407f 100644 --- a/docs/contributing-guide.md +++ b/docs/contributing-guide.md @@ -167,9 +167,11 @@ E2E tests validate the full pipeline (preset + language) against CSV fixtures. T tests/e2e/files/ gladia-3/ default.csv + da.csv de.csv en.csv es.csv + fi.csv fr.csv it.csv nl.csv diff --git a/normalization/languages/__init__.py b/normalization/languages/__init__.py index bfa6a54..7739867 100644 --- a/normalization/languages/__init__.py +++ b/normalization/languages/__init__.py @@ -1,10 +1,11 @@ -from . import dutch, english, finnish, french, german, italian, spanish, swedish +from . import danish, dutch, english, finnish, french, german, italian, spanish, swedish from .base import LanguageOperators from .registry import get_language_registry, register_language register_language(LanguageOperators) __all__ = [ + "danish", "dutch", "english", "finnish", diff --git a/normalization/languages/danish/__init__.py b/normalization/languages/danish/__init__.py new file mode 100644 index 0000000..b2eee1b --- /dev/null +++ b/normalization/languages/danish/__init__.py @@ -0,0 +1,7 @@ +from .operators import DanishOperators +from .replacements import DANISH_REPLACEMENTS + +__all__ = [ + "DanishOperators", + "DANISH_REPLACEMENTS", +] diff --git a/normalization/languages/danish/number_normalizer.py b/normalization/languages/danish/number_normalizer.py new file mode 100644 index 0000000..9117417 --- /dev/null +++ b/normalization/languages/danish/number_normalizer.py @@ -0,0 +1,435 @@ +"""Danish number normalizer (STT-oriented). + +``text2num.alpha2digit`` does not support Danish, so this module implements +spelled-out cardinal parsing for common transcript patterns: 0–999, ``tusind`` +compounds, and large multipliers (``million``, ``milliard``, ``billion``). +Danish uses the vigesimal (base-20) system for 50–90 and forms compound numbers +as ``"X og Y"`` (e.g. ``"en og tyve"`` = 21) or as glued words (``"enogtyve"``). +Optionally rewrites currency symbols then restores plural currency words from config. +""" + +from __future__ import annotations + +import re + + +def _fold(s: str) -> str: + return s.lower() + + +def _get(table: dict[str, int], word: str) -> int | None: + fw = _fold(word) + for k, v in table.items(): + if _fold(k) == fw: + return v + return None + + +_ONES_2_9: dict[str, int] = { + "to": 2, + "tre": 3, + "fire": 4, + "fem": 5, + "seks": 6, + "syv": 7, + "otte": 8, + "ni": 9, +} + +_TEENS: dict[str, int] = { + "ti": 10, + "elleve": 11, + "tolv": 12, + "tretten": 13, + "fjorten": 14, + "femten": 15, + "seksten": 16, + "sytten": 17, + "atten": 18, + "nitten": 19, +} + +_TENS: dict[str, int] = { + "tyve": 20, + "tredive": 30, + "tredve": 30, # colloquial spoken variant + "fyrre": 40, + "halvtreds": 50, + "tres": 60, + "halvfjerds": 70, + "firs": 80, + "halvfems": 90, +} + +# Used for og-compound ones part (includes en/et = 1 alongside 2-9) +_ONES_FOR_OG: dict[str, int] = {"en": 1, "et": 1, **_ONES_2_9} + +_DIGIT_TO_DANISH: dict[str, str] = { + "0": "nul", + "1": "en", + "2": "to", + "3": "tre", + "4": "fire", + "5": "fem", + "6": "seks", + "7": "syv", + "8": "otte", + "9": "ni", +} + +_RE_MIXED_NUMBER = re.compile( + r"\b(\d+)\s+(" + r"milliard|milliarder|million|millioner|billion|billioner|tusind" + r")\b", + re.IGNORECASE, +) + +_BIG_MULT: dict[str, int] = { + "tusind": 1_000, + "million": 1_000_000, + "millioner": 1_000_000, + "milliard": 1_000_000_000, + "milliarder": 1_000_000_000, + "billion": 1_000_000_000_000, + "billioner": 1_000_000_000_000, +} + +# Sorted longest-first so "otte" is tried before "et" etc. in glued-compound detection. +_ONES_FOR_OG_SORTED: tuple[tuple[str, int], ...] = tuple( + sorted(_ONES_FOR_OG.items(), key=lambda kv: len(kv[0]), reverse=True) +) + + +def _try_parse_og_compound(word: str) -> int | None: + """Parse a glued Danish ones-og-tens compound like ``'enogtyve'`` = 21.""" + fw = _fold(word) + for ones_str, ones_val in _ONES_FOR_OG_SORTED: + prefix = ones_str + "og" + if fw.startswith(prefix): + rest = fw[len(prefix) :] + tens_val = _TENS.get(rest) + if tens_val is not None: + return tens_val + ones_val + return None + + +def _normalize_mixed_numbers(text: str) -> str: + """Convert ``3 milliard`` → ``tre milliard`` so the word parser yields 3 000 000 000.""" + + def replace(match: re.Match[str]) -> str: + number = match.group(1) + multiplier = match.group(2) + if len(number) == 1 and number in _DIGIT_TO_DANISH: + return f"{_DIGIT_TO_DANISH[number]} {multiplier}" + return match.group(0) + + return _RE_MIXED_NUMBER.sub(replace, text) + + +def _singular_spoken_unit(trailing_word: str) -> str: + t = trailing_word.lower() + if t == "euros": + return "euro" + if t == "dollars": + return "dollar" + if t == "pund": + return "pund" # invariant in Danish (same singular and plural) + if t == "cent": + return "cent" + if t == "yen": + return "yen" + if t == "kroner": + return "krone" + return trailing_word + + +def _normalize_currency_symbols( + text: str, + currency_symbol_to_word: dict[str, str] | None, +) -> str: + if not currency_symbol_to_word: + return text + num = r"\d+(?:[.,]\d+)?" + for symbol, trailing in currency_symbol_to_word.items(): + singular = _singular_spoken_unit(trailing) + esc = re.escape(symbol) + sym = rf"\b{esc}\b" if len(symbol) > 1 else esc + text = re.sub(rf"{sym}\s*({num})", rf"\1 {singular}", text, flags=re.IGNORECASE) + text = re.sub(rf"({num})\s*{sym}", rf"\1 {singular}", text, flags=re.IGNORECASE) + return text + + +def _currency_plural_fix_patterns( + currency_symbol_to_word: dict[str, str] | None, +) -> tuple[tuple[re.Pattern[str], str], ...]: + if not currency_symbol_to_word: + return () + amount = r"(\d+(?:[.,]\d+)?)" + seen: set[str] = set() + out: list[tuple[re.Pattern[str], str]] = [] + for _symbol, trailing in currency_symbol_to_word.items(): + tl = trailing.lower() + if tl in seen: + continue + seen.add(tl) + singular = _singular_spoken_unit(trailing) + if singular.lower() == tl: + continue + if tl == "euros": + pat = re.compile(rf"\b{amount}\s+euro(?:'s)?\b", re.IGNORECASE) + out.append((pat, rf"\1 {trailing}")) + elif tl == "kroner": + pat = re.compile(rf"\b{amount}\s+krone\b", re.IGNORECASE) + out.append((pat, rf"\1 {trailing}")) + else: + pat = re.compile( + rf"\b{amount}\s+{re.escape(singular)}\b", + re.IGNORECASE, + ) + out.append((pat, rf"\1 {trailing}")) + return tuple(out) + + +def _apply_currency_plural_fixes( + text: str, + fixers: tuple[tuple[re.Pattern[str], str], ...], +) -> str: + for pattern, repl in fixers: + text = pattern.sub(repl, text) + return text + + +def _hundred_multiplier(word: str) -> int | None: + if _fold(word) in ("en", "et"): + return 1 + return _get(_ONES_2_9, word) + + +class DanishNumberNormalizer: + """Convert Danish spelled-out numbers to digits.""" + + def __init__(self, currency_symbol_to_word: dict[str, str] | None = None) -> None: + self._currency_symbol_to_word = currency_symbol_to_word + self._currency_plural_fixes = _currency_plural_fix_patterns( + currency_symbol_to_word + ) + + def __call__(self, text: str) -> str: + if not text.strip(): + return text + text = _normalize_currency_symbols(text, self._currency_symbol_to_word) + text = _normalize_mixed_numbers(text) + words = text.split() + out: list[str] = [] + i = 0 + n = len(words) + while i < n: + parsed = self._parse_number(words, i, n) + if parsed is not None: + end, value = parsed + out.append(str(value)) + i = end + else: + out.append(words[i]) + i += 1 + text = " ".join(out) + text = _apply_currency_plural_fixes(text, self._currency_plural_fixes) + return text + + def _parse_number(self, words: list[str], i: int, n: int) -> tuple[int, int] | None: + if i >= n: + return None + + fw = _fold(words[i]) + + if fw == "tusind": + tail = self._parse_number(words, i + 1, n) + if tail is not None: + end, v2 = tail + return end, 1000 + v2 + return i + 1, 1000 + + if i + 1 < n and fw in ("en", "et") and _fold(words[i + 1]) == "tusind": + tail = self._parse_number(words, i + 2, n) + base = 1000 + if tail is not None: + end, v2 = tail + return end, base + v2 + return i + 2, base + + if ( + i + 1 < n + and fw in ("en", "et") + and _fold(words[i + 1]) + in ( + "million", + "millioner", + ) + ): + tail = self._parse_number(words, i + 2, n) + base = 1_000_000 + if tail is not None: + end, v2 = tail + return end, base + v2 + return i + 2, base + + if ( + i + 1 < n + and fw in ("en", "et") + and _fold(words[i + 1]) + in ( + "milliard", + "milliarder", + ) + ): + tail = self._parse_number(words, i + 2, n) + base = 1_000_000_000 + if tail is not None: + end, v2 = tail + return end, base + v2 + return i + 2, base + + if ( + i + 1 < n + and fw in ("en", "et") + and _fold(words[i + 1]) + in ( + "billion", + "billioner", + ) + ): + tail = self._parse_number(words, i + 2, n) + base = 1_000_000_000_000 + if tail is not None: + end, v2 = tail + return end, base + v2 + return i + 2, base + + sub999 = self._parse_0_999(words, i, n) + if sub999 is None: + return None + j, v = sub999 + if j >= n: + return j, v + + next_fw = _fold(words[j]) + if next_fw == "tusind": + j += 1 + prod = v * 1000 + if j >= n: + return j, prod + tail = self._parse_number(words, j, n) + if tail is not None: + end, v2 = tail + return end, prod + v2 + return j, prod + + mult = _BIG_MULT.get(next_fw) + if mult is not None and mult >= 1_000_000: + j += 1 + prod = v * mult + if j >= n: + return j, prod + tail = self._parse_number(words, j, n) + if tail is not None: + end, v2 = tail + return end, prod + v2 + return j, prod + + return j, v + + def _parse_0_999(self, words: list[str], i: int, n: int) -> tuple[int, int] | None: + if i >= n: + return None + + fw = _fold(words[i]) + + if fw == "nul": + if i + 1 < n and self._continues_number(words[i + 1]): + return None + return i + 1, 0 + + if fw == "hundrede": + j = i + 1 + if j < n and _fold(words[j]) == "og": + j += 1 + tail = self._parse_0_99_after_og(words, j, n) + if tail is not None: + je, tv = tail + return je, 100 + tv + return i + 1, 100 + + if i + 1 < n and _fold(words[i + 1]) == "hundrede": + m = _hundred_multiplier(words[i]) + if m is None: + return None + base = m * 100 + j = i + 2 + if j < n and _fold(words[j]) == "og": + j += 1 + tail = self._parse_0_99_after_og(words, j, n) + if tail is not None: + je, tv = tail + return je, base + tv + return j, base + + return self._parse_0_99(words, i, n) + + def _continues_number(self, word: str) -> bool: + fw = _fold(word) + if fw in ("hundrede", "tusind"): + return True + if fw in _BIG_MULT: + return True + if _get(_TEENS, word) is not None: + return True + if _get(_TENS, word) is not None: + return True + if _get(_ONES_2_9, word) is not None: + return True + if fw in ("en", "et"): + return True + return False + + def _parse_0_99_after_og( + self, words: list[str], i: int, n: int + ) -> tuple[int, int] | None: + """Parse 0–99, additionally accepting standalone ``en``/``et`` as 1 after ``og``.""" + result = self._parse_0_99(words, i, n) + if result is not None: + return result + if i < n and _fold(words[i]) in ("en", "et"): + return i + 1, 1 + return None + + def _parse_0_99(self, words: list[str], i: int, n: int) -> tuple[int, int] | None: + if i >= n: + return None + + fw = _fold(words[i]) + + compound = _try_parse_og_compound(words[i]) + if compound is not None: + return i + 1, compound + + v = _get(_TEENS, words[i]) + if v is not None: + return i + 1, v + + tens = _get(_TENS, words[i]) + if tens is not None: + return i + 1, tens + + # Three-word og-compound: "en og tyve" = 21 + ones_val = _ONES_FOR_OG.get(fw) + if ones_val is not None and i + 2 < n and _fold(words[i + 1]) == "og": + tens_val = _get(_TENS, words[i + 2]) + if tens_val is not None: + return i + 3, tens_val + ones_val + + o = _get(_ONES_2_9, words[i]) + if o is not None: + return i + 1, o + + # Standalone "en"/"et" → do not convert (would be article in normal prose) + return None diff --git a/normalization/languages/danish/operators.py b/normalization/languages/danish/operators.py new file mode 100644 index 0000000..7353a7c --- /dev/null +++ b/normalization/languages/danish/operators.py @@ -0,0 +1,106 @@ +from normalization.languages.base import LanguageConfig, LanguageOperators +from normalization.languages.danish.number_normalizer import DanishNumberNormalizer +from normalization.languages.registry import register_language + +_DANISH_DIGIT_WORDS: dict[str, str] = { + "nul": "0", + "en": "1", + "et": "1", + "to": "2", + "tre": "3", + "fire": "4", + "fem": "5", + "seks": "6", + "syv": "7", + "otte": "8", + "ni": "9", +} + +DANISH_CONFIG = LanguageConfig( + code="da", + decimal_separator=",", + decimal_word="komma", + thousand_separator=".", + symbols_to_words={ + "@": "snabel a", + ".": "punktum", + "+": "plus", + "=": "er lig med", + ">": "større end", + "<": "mindre end", + "°": "grader", + "°C": "grader celsius", + "°F": "grader fahrenheit", + "%": "procent", + }, + currency_symbol_to_word={ + "€": "euros", + "$": "dollars", + "£": "pund", + "¢": "cent", + "¥": "yen", + "kr": "kroner", + }, + filler_words=[ + # Post-diacritics forms: remove_diacritics runs before remove_filler_words, + # so ø→o and å→a conversions have already happened at match time. + "oh", # from "øh" + "ohm", # from "øhm" + "hm", + "hmm", + "mm", + "mhm", + "altsa", # from "altså" + "na", # from "nå" + ], + digit_words=_DANISH_DIGIT_WORDS, + number_words=[ + *_DANISH_DIGIT_WORDS, + "ti", + "elleve", + "tolv", + "tretten", + "fjorten", + "femten", + "seksten", + "sytten", + "atten", + "nitten", + "tyve", + "tredive", + "tredve", + "fyrre", + "halvtreds", + "tres", + "halvfjerds", + "firs", + "halvfems", + "hundrede", + "tusind", + "million", + "millioner", + "milliard", + "milliarder", + "billion", + "billioner", + ], + plus_word="plus", +) + + +@register_language +class DanishOperators(LanguageOperators): + def __init__(self) -> None: + super().__init__(DANISH_CONFIG) + self._number_normalizer = DanishNumberNormalizer( + DANISH_CONFIG.currency_symbol_to_word, + ) + + def expand_written_numbers(self, text: str) -> str: + """Convert Danish spelled-out numbers to digits (e.g. en og tyve → 21).""" + return self._number_normalizer(text) + + def get_word_replacements(self) -> dict[str, str]: + from normalization.languages.danish.replacements import DANISH_REPLACEMENTS + + return DANISH_REPLACEMENTS diff --git a/normalization/languages/danish/replacements.py b/normalization/languages/danish/replacements.py new file mode 100644 index 0000000..7536f2b --- /dev/null +++ b/normalization/languages/danish/replacements.py @@ -0,0 +1,6 @@ +"""Single-token colloquial / spelling variants → standard Danish (canonical for WER).""" + +DANISH_REPLACEMENTS: dict[str, str] = { + "euro": "euros", + "krone": "kroner", +} diff --git a/tests/e2e/files/gladia-3/da.csv b/tests/e2e/files/gladia-3/da.csv new file mode 100644 index 0000000..c334516 --- /dev/null +++ b/tests/e2e/files/gladia-3/da.csv @@ -0,0 +1,14 @@ +input,expected +ti euro,10 euros +2 < 5,2 mindre end 5 +50°C,50 grader celsius +Det koster €50,det koster 50 euros +en og tyve kroner,21 kroner +10 kr,10 kroner +øh hej,hej +test@example.com,test snabel a example punktum com +x = 5,x er lig med 5 +"1.234,56",1234 komma 56 +"3,14",3 komma 14 +en million,1000000 +ping pong,ping pong diff --git a/tests/unit/languages/danish_number_normalizer_test.py b/tests/unit/languages/danish_number_normalizer_test.py new file mode 100644 index 0000000..b8de6c6 --- /dev/null +++ b/tests/unit/languages/danish_number_normalizer_test.py @@ -0,0 +1,113 @@ +import pytest + +from normalization.languages.danish.number_normalizer import DanishNumberNormalizer +from normalization.languages.danish.operators import DANISH_CONFIG + + +@pytest.fixture +def normalizer() -> DanishNumberNormalizer: + return DanishNumberNormalizer(DANISH_CONFIG.currency_symbol_to_word) + + +@pytest.fixture +def normalizer_no_currency() -> DanishNumberNormalizer: + return DanishNumberNormalizer(None) + + +@pytest.mark.parametrize( + ("text", "expected"), + [ + # Basic digits and teens + ("ni", "9"), + ("elleve", "11"), + ("femten", "15"), + ("nitten", "19"), + # Vigesimal tens (unique to Danish) + ("tyve", "20"), + ("tredive", "30"), + ("tredve", "30"), # colloquial variant + ("fyrre", "40"), + ("halvtreds", "50"), + ("tres", "60"), + ("halvfjerds", "70"), + ("firs", "80"), + ("halvfems", "90"), + # Three-word og-compounds + ("en og tyve", "21"), + ("to og tyve", "22"), + ("tre og halvtreds", "53"), + ("ni og halvfems", "99"), + ("fem og firs", "85"), + # Glued og-compounds + ("enogtyve", "21"), + ("toogtyve", "22"), + ("fireogfirs", "84"), + ("nioghalvfems", "99"), + ("treoghalvfjerds", "73"), + # Hundreds + ("hundrede", "100"), + ("to hundrede", "200"), + ("ni hundrede", "900"), + ("to hundrede og en", "201"), + ("tre hundrede og femten", "315"), + ("to hundrede og en og tyve", "221"), + ("to hundrede og fireoghalvtreds", "254"), + # Tusind + ("tusind", "1000"), + ("et tusind", "1000"), + ("to tusind", "2000"), + ("to tusind tre hundrede", "2300"), + ("tres tusind", "60000"), + # Large multipliers + ("en million", "1000000"), + ("to millioner", "2000000"), + ("en milliard", "1000000000"), + ("to milliarder", "2000000000"), + ("en billion", "1000000000000"), + # Mixed digit + word + ("3 milliard", "3000000000"), + ("5 million", "5000000"), + # Zero + ("nul", "0"), + ], +) +def test_danish_spelled_numbers( + normalizer: DanishNumberNormalizer, text: str, expected: str +) -> None: + assert normalizer(text) == expected + + +@pytest.mark.parametrize( + ("text", "expected"), + [ + ("ti euro", "10 euros"), + ("hundrede kroner", "100 kroner"), + ("€10", "10 euros"), + ("10 kr", "10 kroner"), + ("£50", "50 pund"), + ("fem dollar", "5 dollars"), + ], +) +def test_currency_symbols_and_plural_trailing_words( + normalizer: DanishNumberNormalizer, text: str, expected: str +) -> None: + assert normalizer(text) == expected + + +def test_without_currency_config_leaves_currency_symbol( + normalizer_no_currency: DanishNumberNormalizer, +) -> None: + assert normalizer_no_currency("en og tyve") == "21" + assert normalizer_no_currency("€10") == "€10" + assert normalizer_no_currency("3 milliard") == "3000000000" + + +def test_non_numeric_text_unchanged(normalizer: DanishNumberNormalizer) -> None: + text = "det her er almindelig tekst" + assert normalizer(text) == text + + +def test_kroner_word_not_treated_as_currency_suffix( + normalizer: DanishNumberNormalizer, +) -> None: + assert normalizer("25 kroner") == "25 kroner" diff --git a/tests/unit/languages/danish_operators_test.py b/tests/unit/languages/danish_operators_test.py new file mode 100644 index 0000000..d1e015b --- /dev/null +++ b/tests/unit/languages/danish_operators_test.py @@ -0,0 +1,27 @@ +import pytest + +from normalization.languages.danish.operators import DanishOperators +from normalization.languages.registry import get_language_registry + + +@pytest.fixture +def operators() -> DanishOperators: + return DanishOperators() + + +def test_danish_is_registered() -> None: + assert "da" in get_language_registry() + + +def test_danish_registry_produces_danish_operators() -> None: + instance = get_language_registry()["da"]() + assert isinstance(instance, DanishOperators) + + +def test_config_code(operators: DanishOperators) -> None: + assert operators.config.code == "da" + + +def test_word_replacements(operators: DanishOperators) -> None: + assert operators.get_word_replacements()["euro"] == "euros" + assert operators.get_word_replacements()["krone"] == "kroner" From 8aa482c88de3925decefacd1aef082f13151a0cb Mon Sep 17 00:00:00 2001 From: karamouche Date: Wed, 27 May 2026 15:39:28 -0400 Subject: [PATCH 2/3] fix: update Danish currency terms to singular form in normalizer and tests --- .../languages/danish/number_normalizer.py | 15 +-------------- normalization/languages/danish/operators.py | 4 ++-- normalization/languages/danish/replacements.py | 1 - tests/e2e/files/gladia-3/da.csv | 4 ++-- .../languages/danish_number_normalizer_test.py | 6 +++--- tests/unit/languages/danish_operators_test.py | 2 +- 6 files changed, 9 insertions(+), 23 deletions(-) diff --git a/normalization/languages/danish/number_normalizer.py b/normalization/languages/danish/number_normalizer.py index 9117417..c08325b 100644 --- a/normalization/languages/danish/number_normalizer.py +++ b/normalization/languages/danish/number_normalizer.py @@ -128,16 +128,6 @@ def replace(match: re.Match[str]) -> str: def _singular_spoken_unit(trailing_word: str) -> str: t = trailing_word.lower() - if t == "euros": - return "euro" - if t == "dollars": - return "dollar" - if t == "pund": - return "pund" # invariant in Danish (same singular and plural) - if t == "cent": - return "cent" - if t == "yen": - return "yen" if t == "kroner": return "krone" return trailing_word @@ -175,10 +165,7 @@ def _currency_plural_fix_patterns( singular = _singular_spoken_unit(trailing) if singular.lower() == tl: continue - if tl == "euros": - pat = re.compile(rf"\b{amount}\s+euro(?:'s)?\b", re.IGNORECASE) - out.append((pat, rf"\1 {trailing}")) - elif tl == "kroner": + if tl == "kroner": pat = re.compile(rf"\b{amount}\s+krone\b", re.IGNORECASE) out.append((pat, rf"\1 {trailing}")) else: diff --git a/normalization/languages/danish/operators.py b/normalization/languages/danish/operators.py index 7353a7c..f0f42d6 100644 --- a/normalization/languages/danish/operators.py +++ b/normalization/languages/danish/operators.py @@ -34,8 +34,8 @@ "%": "procent", }, currency_symbol_to_word={ - "€": "euros", - "$": "dollars", + "€": "euro", + "$": "dollar", "£": "pund", "¢": "cent", "¥": "yen", diff --git a/normalization/languages/danish/replacements.py b/normalization/languages/danish/replacements.py index 7536f2b..658f9bd 100644 --- a/normalization/languages/danish/replacements.py +++ b/normalization/languages/danish/replacements.py @@ -1,6 +1,5 @@ """Single-token colloquial / spelling variants → standard Danish (canonical for WER).""" DANISH_REPLACEMENTS: dict[str, str] = { - "euro": "euros", "krone": "kroner", } diff --git a/tests/e2e/files/gladia-3/da.csv b/tests/e2e/files/gladia-3/da.csv index c334516..de37d07 100644 --- a/tests/e2e/files/gladia-3/da.csv +++ b/tests/e2e/files/gladia-3/da.csv @@ -1,8 +1,8 @@ input,expected -ti euro,10 euros +ti euro,10 euro 2 < 5,2 mindre end 5 50°C,50 grader celsius -Det koster €50,det koster 50 euros +Det koster €50,det koster 50 euro en og tyve kroner,21 kroner 10 kr,10 kroner øh hej,hej diff --git a/tests/unit/languages/danish_number_normalizer_test.py b/tests/unit/languages/danish_number_normalizer_test.py index b8de6c6..845d9ed 100644 --- a/tests/unit/languages/danish_number_normalizer_test.py +++ b/tests/unit/languages/danish_number_normalizer_test.py @@ -80,12 +80,12 @@ def test_danish_spelled_numbers( @pytest.mark.parametrize( ("text", "expected"), [ - ("ti euro", "10 euros"), + ("ti euro", "10 euro"), ("hundrede kroner", "100 kroner"), - ("€10", "10 euros"), + ("€10", "10 euro"), ("10 kr", "10 kroner"), ("£50", "50 pund"), - ("fem dollar", "5 dollars"), + ("fem dollar", "5 dollar"), ], ) def test_currency_symbols_and_plural_trailing_words( diff --git a/tests/unit/languages/danish_operators_test.py b/tests/unit/languages/danish_operators_test.py index d1e015b..41d87ec 100644 --- a/tests/unit/languages/danish_operators_test.py +++ b/tests/unit/languages/danish_operators_test.py @@ -23,5 +23,5 @@ def test_config_code(operators: DanishOperators) -> None: def test_word_replacements(operators: DanishOperators) -> None: - assert operators.get_word_replacements()["euro"] == "euros" assert operators.get_word_replacements()["krone"] == "kroner" + assert "euro" not in operators.get_word_replacements() From 3c2e0137622a6146c7bec87ff32c8d9a4ccc6a3e Mon Sep 17 00:00:00 2001 From: karamouche Date: Wed, 27 May 2026 16:16:49 -0400 Subject: [PATCH 3/3] fix: refine Danish filler words --- normalization/languages/danish/operators.py | 4 +++- tests/unit/languages/danish_number_normalizer_test.py | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/normalization/languages/danish/operators.py b/normalization/languages/danish/operators.py index f0f42d6..dd2df4a 100644 --- a/normalization/languages/danish/operators.py +++ b/normalization/languages/danish/operators.py @@ -44,8 +44,10 @@ filler_words=[ # Post-diacritics forms: remove_diacritics runs before remove_filler_words, # so ø→o and å→a conversions have already happened at match time. + # "ohm" (from "øhm") is intentionally excluded: after diacritics removal it + # collides with the SI unit "ohm", which would be silently deleted in phrases + # like "modstanden er 10 ohm". "oh", # from "øh" - "ohm", # from "øhm" "hm", "hmm", "mm", diff --git a/tests/unit/languages/danish_number_normalizer_test.py b/tests/unit/languages/danish_number_normalizer_test.py index 845d9ed..a25c6c4 100644 --- a/tests/unit/languages/danish_number_normalizer_test.py +++ b/tests/unit/languages/danish_number_normalizer_test.py @@ -40,6 +40,7 @@ def normalizer_no_currency() -> DanishNumberNormalizer: ("fem og firs", "85"), # Glued og-compounds ("enogtyve", "21"), + ("etogtyve", "21"), # neuter-"et" glued form ("toogtyve", "22"), ("fireogfirs", "84"), ("nioghalvfems", "99"), @@ -55,6 +56,7 @@ def normalizer_no_currency() -> DanishNumberNormalizer: # Tusind ("tusind", "1000"), ("et tusind", "1000"), + ("en tusind", "1000"), # common-gender form ("to tusind", "2000"), ("to tusind tre hundrede", "2300"), ("tres tusind", "60000"), @@ -64,6 +66,7 @@ def normalizer_no_currency() -> DanishNumberNormalizer: ("en milliard", "1000000000"), ("to milliarder", "2000000000"), ("en billion", "1000000000000"), + ("to billioner", "2000000000000"), # Mixed digit + word ("3 milliard", "3000000000"), ("5 million", "5000000"),