From e832efd97435b7dc7acc982f5a5aa4940c40306e Mon Sep 17 00:00:00 2001
From: karamouche <hugo4ibt@gmail.com>
Date: Wed, 27 May 2026 09:16:00 -0400
Subject: [PATCH 1/3] feat: add Danish language support with number
 normalization and operators

---
 README.md                                     |   6 +-
 docs/contributing-guide.md                    |   2 +
 normalization/languages/__init__.py           |   3 +-
 normalization/languages/danish/__init__.py    |   7 +
 .../languages/danish/number_normalizer.py     | 435 ++++++++++++++++++
 normalization/languages/danish/operators.py   | 106 +++++
 .../languages/danish/replacements.py          |   6 +
 tests/e2e/files/gladia-3/da.csv               |  14 +
 .../danish_number_normalizer_test.py          | 113 +++++
 tests/unit/languages/danish_operators_test.py |  27 ++
 10 files changed, 716 insertions(+), 3 deletions(-)
 create mode 100644 normalization/languages/danish/__init__.py
 create mode 100644 normalization/languages/danish/number_normalizer.py
 create mode 100644 normalization/languages/danish/operators.py
 create mode 100644 normalization/languages/danish/replacements.py
 create mode 100644 tests/e2e/files/gladia-3/da.csv
 create mode 100644 tests/unit/languages/danish_number_normalizer_test.py
 create mode 100644 tests/unit/languages/danish_operators_test.py

diff --git a/README.md b/README.md
index 724e1fd..c8874bf 100644
--- a/README.md
+++ b/README.md
@@ -112,11 +112,13 @@ Pipelines are defined declaratively in **YAML presets**. Each preset lists the s
 
 | Code | Language |
 | ---- | -------- |
+| `da` | Danish   |
+| `de` | German   |
 | `en` | English  |
+| `es` | Spanish  |
+| `fi` | Finnish  |
 | `fr` | French   |
-| `de` | German   |
 | `it` | Italian  |
-| `es` | Spanish  |
 | `nl` | Dutch    |
 | `sv` | Swedish  |
 
diff --git a/docs/contributing-guide.md b/docs/contributing-guide.md
index e271b87..e57407f 100644
--- a/docs/contributing-guide.md
+++ b/docs/contributing-guide.md
@@ -167,9 +167,11 @@ E2E tests validate the full pipeline (preset + language) against CSV fixtures. T
 tests/e2e/files/
   gladia-3/
     default.csv
+    da.csv
     de.csv
     en.csv
     es.csv
+    fi.csv
     fr.csv
     it.csv
     nl.csv
diff --git a/normalization/languages/__init__.py b/normalization/languages/__init__.py
index bfa6a54..7739867 100644
--- a/normalization/languages/__init__.py
+++ b/normalization/languages/__init__.py
@@ -1,10 +1,11 @@
-from . import dutch, english, finnish, french, german, italian, spanish, swedish
+from . import danish, dutch, english, finnish, french, german, italian, spanish, swedish
 from .base import LanguageOperators
 from .registry import get_language_registry, register_language
 
 register_language(LanguageOperators)
 
 __all__ = [
+    "danish",
     "dutch",
     "english",
     "finnish",
diff --git a/normalization/languages/danish/__init__.py b/normalization/languages/danish/__init__.py
new file mode 100644
index 0000000..b2eee1b
--- /dev/null
+++ b/normalization/languages/danish/__init__.py
@@ -0,0 +1,7 @@
+from .operators import DanishOperators
+from .replacements import DANISH_REPLACEMENTS
+
+__all__ = [
+    "DanishOperators",
+    "DANISH_REPLACEMENTS",
+]
diff --git a/normalization/languages/danish/number_normalizer.py b/normalization/languages/danish/number_normalizer.py
new file mode 100644
index 0000000..9117417
--- /dev/null
+++ b/normalization/languages/danish/number_normalizer.py
@@ -0,0 +1,435 @@
+"""Danish number normalizer (STT-oriented).
+
+``text2num.alpha2digit`` does not support Danish, so this module implements
+spelled-out cardinal parsing for common transcript patterns: 0–999, ``tusind``
+compounds, and large multipliers (``million``, ``milliard``, ``billion``).
+Danish uses the vigesimal (base-20) system for 50–90 and forms compound numbers
+as ``"X og Y"`` (e.g. ``"en og tyve"`` = 21) or as glued words (``"enogtyve"``).
+Optionally rewrites currency symbols then restores plural currency words from config.
+"""
+
+from __future__ import annotations
+
+import re
+
+
+def _fold(s: str) -> str:
+    return s.lower()
+
+
+def _get(table: dict[str, int], word: str) -> int | None:
+    fw = _fold(word)
+    for k, v in table.items():
+        if _fold(k) == fw:
+            return v
+    return None
+
+
+_ONES_2_9: dict[str, int] = {
+    "to": 2,
+    "tre": 3,
+    "fire": 4,
+    "fem": 5,
+    "seks": 6,
+    "syv": 7,
+    "otte": 8,
+    "ni": 9,
+}
+
+_TEENS: dict[str, int] = {
+    "ti": 10,
+    "elleve": 11,
+    "tolv": 12,
+    "tretten": 13,
+    "fjorten": 14,
+    "femten": 15,
+    "seksten": 16,
+    "sytten": 17,
+    "atten": 18,
+    "nitten": 19,
+}
+
+_TENS: dict[str, int] = {
+    "tyve": 20,
+    "tredive": 30,
+    "tredve": 30,  # colloquial spoken variant
+    "fyrre": 40,
+    "halvtreds": 50,
+    "tres": 60,
+    "halvfjerds": 70,
+    "firs": 80,
+    "halvfems": 90,
+}
+
+# Used for og-compound ones part (includes en/et = 1 alongside 2-9)
+_ONES_FOR_OG: dict[str, int] = {"en": 1, "et": 1, **_ONES_2_9}
+
+_DIGIT_TO_DANISH: dict[str, str] = {
+    "0": "nul",
+    "1": "en",
+    "2": "to",
+    "3": "tre",
+    "4": "fire",
+    "5": "fem",
+    "6": "seks",
+    "7": "syv",
+    "8": "otte",
+    "9": "ni",
+}
+
+_RE_MIXED_NUMBER = re.compile(
+    r"\b(\d+)\s+("
+    r"milliard|milliarder|million|millioner|billion|billioner|tusind"
+    r")\b",
+    re.IGNORECASE,
+)
+
+_BIG_MULT: dict[str, int] = {
+    "tusind": 1_000,
+    "million": 1_000_000,
+    "millioner": 1_000_000,
+    "milliard": 1_000_000_000,
+    "milliarder": 1_000_000_000,
+    "billion": 1_000_000_000_000,
+    "billioner": 1_000_000_000_000,
+}
+
+# Longest-first is purely defensive: no "<ones>og" prefix currently prefixes another.
+_ONES_FOR_OG_SORTED: tuple[tuple[str, int], ...] = tuple(
+    sorted(_ONES_FOR_OG.items(), key=lambda kv: len(kv[0]), reverse=True)
+)
+
+
+def _try_parse_og_compound(word: str) -> int | None:
+    """Parse a glued Danish ones-og-tens compound like ``'enogtyve'`` = 21."""
+    fw = _fold(word)
+    for ones_str, ones_val in _ONES_FOR_OG_SORTED:
+        prefix = ones_str + "og"
+        if fw.startswith(prefix):
+            rest = fw[len(prefix) :]
+            tens_val = _TENS.get(rest)
+            if tens_val is not None:
+                return tens_val + ones_val
+    return None
+
+
+def _normalize_mixed_numbers(text: str) -> str:
+    """Spell out a single digit before a multiplier (``3 milliard`` → ``tre milliard``)."""
+
+    def replace(match: re.Match[str]) -> str:
+        number = match.group(1)
+        multiplier = match.group(2)
+        if len(number) == 1 and number in _DIGIT_TO_DANISH:
+            return f"{_DIGIT_TO_DANISH[number]} {multiplier}"
+        return match.group(0)  # multi-digit amounts (e.g. "12 millioner") stay as-is
+
+    return _RE_MIXED_NUMBER.sub(replace, text)
+
+
+def _singular_spoken_unit(trailing_word: str) -> str:
+    t = trailing_word.lower()
+    if t == "euros":
+        return "euro"
+    if t == "dollars":
+        return "dollar"
+    if t == "pund":
+        return "pund"  # invariant in Danish (same singular and plural)
+    if t == "cent":
+        return "cent"
+    if t == "yen":
+        return "yen"
+    if t == "kroner":
+        return "krone"
+    return trailing_word
+
+
+def _normalize_currency_symbols(
+    text: str,
+    currency_symbol_to_word: dict[str, str] | None,
+) -> str:
+    if not currency_symbol_to_word:
+        return text
+    num = r"\d+(?:[.,]\d+)?"
+    for symbol, trailing in currency_symbol_to_word.items():
+        singular = _singular_spoken_unit(trailing)
+        esc = re.escape(symbol)
+        sym = rf"\b{esc}\b" if len(symbol) > 1 else esc
+        text = re.sub(rf"{sym}\s*({num})", rf"\1 {singular}", text, flags=re.IGNORECASE)
+        text = re.sub(rf"({num})\s*{sym}", rf"\1 {singular}", text, flags=re.IGNORECASE)
+    return text
+
+
+def _currency_plural_fix_patterns(
+    currency_symbol_to_word: dict[str, str] | None,
+) -> tuple[tuple[re.Pattern[str], str], ...]:
+    if not currency_symbol_to_word:
+        return ()
+    amount = r"(\d+(?:[.,]\d+)?)"
+    seen: set[str] = set()
+    out: list[tuple[re.Pattern[str], str]] = []
+    for _symbol, trailing in currency_symbol_to_word.items():
+        tl = trailing.lower()
+        if tl in seen:
+            continue
+        seen.add(tl)
+        singular = _singular_spoken_unit(trailing)
+        if singular.lower() == tl:
+            continue
+        if tl == "euros":
+            pat = re.compile(rf"\b{amount}\s+euro(?:'s)?\b", re.IGNORECASE)
+            out.append((pat, rf"\1 {trailing}"))
+        elif tl == "kroner":
+            pat = re.compile(rf"\b{amount}\s+krone\b", re.IGNORECASE)
+            out.append((pat, rf"\1 {trailing}"))
+        else:
+            pat = re.compile(
+                rf"\b{amount}\s+{re.escape(singular)}\b",
+                re.IGNORECASE,
+            )
+            out.append((pat, rf"\1 {trailing}"))
+    return tuple(out)
+
+
+def _apply_currency_plural_fixes(
+    text: str,
+    fixers: tuple[tuple[re.Pattern[str], str], ...],
+) -> str:
+    for pattern, repl in fixers:
+        text = pattern.sub(repl, text)
+    return text
+
+
+def _hundred_multiplier(word: str) -> int | None:
+    if _fold(word) in ("en", "et"):
+        return 1
+    return _get(_ONES_2_9, word)
+
+
+class DanishNumberNormalizer:
+    """Convert Danish spelled-out numbers to digits."""
+
+    def __init__(self, currency_symbol_to_word: dict[str, str] | None = None) -> None:
+        self._currency_symbol_to_word = currency_symbol_to_word
+        self._currency_plural_fixes = _currency_plural_fix_patterns(
+            currency_symbol_to_word
+        )
+
+    def __call__(self, text: str) -> str:
+        if not text.strip():
+            return text
+        text = _normalize_currency_symbols(text, self._currency_symbol_to_word)
+        text = _normalize_mixed_numbers(text)
+        words = text.split()
+        out: list[str] = []
+        i = 0
+        n = len(words)
+        while i < n:
+            parsed = self._parse_number(words, i, n)
+            if parsed is not None:
+                end, value = parsed
+                out.append(str(value))
+                i = end
+            else:
+                out.append(words[i])
+                i += 1
+        text = " ".join(out)
+        text = _apply_currency_plural_fixes(text, self._currency_plural_fixes)
+        return text
+
+    def _parse_number(self, words: list[str], i: int, n: int) -> tuple[int, int] | None:
+        if i >= n:
+            return None
+
+        fw = _fold(words[i])
+
+        if fw == "tusind":
+            tail = self._parse_number(words, i + 1, n)
+            if tail is not None:
+                end, v2 = tail
+                return end, 1000 + v2
+            return i + 1, 1000
+
+        if i + 1 < n and fw in ("en", "et") and _fold(words[i + 1]) == "tusind":
+            tail = self._parse_number(words, i + 2, n)
+            base = 1000
+            if tail is not None:
+                end, v2 = tail
+                return end, base + v2
+            return i + 2, base
+
+        if (
+            i + 1 < n
+            and fw in ("en", "et")
+            and _fold(words[i + 1])
+            in (
+                "million",
+                "millioner",
+            )
+        ):
+            tail = self._parse_number(words, i + 2, n)
+            base = 1_000_000
+            if tail is not None:
+                end, v2 = tail
+                return end, base + v2
+            return i + 2, base
+
+        if (
+            i + 1 < n
+            and fw in ("en", "et")
+            and _fold(words[i + 1])
+            in (
+                "milliard",
+                "milliarder",
+            )
+        ):
+            tail = self._parse_number(words, i + 2, n)
+            base = 1_000_000_000
+            if tail is not None:
+                end, v2 = tail
+                return end, base + v2
+            return i + 2, base
+
+        if (
+            i + 1 < n
+            and fw in ("en", "et")
+            and _fold(words[i + 1])
+            in (
+                "billion",
+                "billioner",
+            )
+        ):
+            tail = self._parse_number(words, i + 2, n)
+            base = 1_000_000_000_000
+            if tail is not None:
+                end, v2 = tail
+                return end, base + v2
+            return i + 2, base
+
+        sub999 = self._parse_0_999(words, i, n)
+        if sub999 is None:
+            return None
+        j, v = sub999
+        if j >= n:
+            return j, v
+
+        next_fw = _fold(words[j])
+        if next_fw == "tusind":
+            j += 1
+            prod = v * 1000
+            if j >= n:
+                return j, prod
+            tail = self._parse_number(words, j, n)
+            if tail is not None:
+                end, v2 = tail
+                return end, prod + v2
+            return j, prod
+
+        mult = _BIG_MULT.get(next_fw)
+        if mult is not None and mult >= 1_000_000:
+            j += 1
+            prod = v * mult
+            if j >= n:
+                return j, prod
+            tail = self._parse_number(words, j, n)
+            if tail is not None:
+                end, v2 = tail
+                return end, prod + v2
+            return j, prod
+
+        return j, v
+
+    def _parse_0_999(self, words: list[str], i: int, n: int) -> tuple[int, int] | None:
+        """Parse a 0–999 cardinal at ``words[i]``; return ``(end, value)`` or ``None``."""
+        if i >= n:
+            return None
+        fw = _fold(words[i])
+
+        if fw == "nul":
+            if i + 1 < n and self._continues_number(words[i + 1]):
+                return None  # "nul" before another number word is not parsed as 0
+            return i + 1, 0
+
+        if fw == "hundrede":  # bare "hundrede" counts as 100
+            j = i + 1
+            if j < n and _fold(words[j]) == "og":
+                j += 1  # tentatively skip the linking "og"
+            tail = self._parse_0_99_after_og(words, j, n)
+            if tail is not None:
+                je, tv = tail
+                return je, 100 + tv
+            return i + 1, 100  # no remainder: a dangling "og" stays in the text
+
+        if i + 1 < n and _fold(words[i + 1]) == "hundrede":
+            m = _hundred_multiplier(words[i])
+            if m is None:
+                return None
+            base = m * 100
+            j = i + 2
+            if j < n and _fold(words[j]) == "og":
+                j += 1  # tentatively skip the linking "og"
+            tail = self._parse_0_99_after_og(words, j, n)
+            if tail is not None:
+                je, tv = tail
+                return je, base + tv
+            return i + 2, base  # not ``j``: a dangling "og" must stay in the text
+
+        return self._parse_0_99(words, i, n)
+
+    def _continues_number(self, word: str) -> bool:
+        fw = _fold(word)
+        if fw in ("hundrede", "tusind"):
+            return True
+        if fw in _BIG_MULT:
+            return True
+        if _get(_TEENS, word) is not None:
+            return True
+        if _get(_TENS, word) is not None:
+            return True
+        if _get(_ONES_2_9, word) is not None:
+            return True
+        if fw in ("en", "et"):
+            return True
+        return False
+
+    def _parse_0_99_after_og(
+        self, words: list[str], i: int, n: int
+    ) -> tuple[int, int] | None:
+        """Parse 0–99, additionally accepting standalone ``en``/``et`` as 1 after ``og``."""
+        result = self._parse_0_99(words, i, n)
+        if result is not None:
+            return result
+        if i < n and _fold(words[i]) in ("en", "et"):
+            return i + 1, 1
+        return None
+
+    def _parse_0_99(self, words: list[str], i: int, n: int) -> tuple[int, int] | None:
+        if i >= n:
+            return None
+
+        fw = _fold(words[i])
+
+        compound = _try_parse_og_compound(words[i])
+        if compound is not None:
+            return i + 1, compound
+
+        v = _get(_TEENS, words[i])
+        if v is not None:
+            return i + 1, v
+
+        tens = _get(_TENS, words[i])
+        if tens is not None:
+            return i + 1, tens
+
+        # Three-word og-compound: "en og tyve" = 21
+        ones_val = _ONES_FOR_OG.get(fw)
+        if ones_val is not None and i + 2 < n and _fold(words[i + 1]) == "og":
+            tens_val = _get(_TENS, words[i + 2])
+            if tens_val is not None:
+                return i + 3, tens_val + ones_val
+
+        o = _get(_ONES_2_9, words[i])
+        if o is not None:
+            return i + 1, o
+
+        # Standalone "en"/"et" → do not convert (would be article in normal prose)
+        return None
diff --git a/normalization/languages/danish/operators.py b/normalization/languages/danish/operators.py
new file mode 100644
index 0000000..7353a7c
--- /dev/null
+++ b/normalization/languages/danish/operators.py
@@ -0,0 +1,106 @@
+from normalization.languages.base import LanguageConfig, LanguageOperators
+from normalization.languages.danish.number_normalizer import DanishNumberNormalizer
+from normalization.languages.registry import register_language
+
+_DANISH_DIGIT_WORDS: dict[str, str] = {
+    "nul": "0",
+    "en": "1",
+    "et": "1",
+    "to": "2",
+    "tre": "3",
+    "fire": "4",
+    "fem": "5",
+    "seks": "6",
+    "syv": "7",
+    "otte": "8",
+    "ni": "9",
+}
+
+DANISH_CONFIG = LanguageConfig(
+    code="da",
+    decimal_separator=",",
+    decimal_word="komma",
+    thousand_separator=".",
+    symbols_to_words={
+        "@": "snabel a",
+        ".": "punktum",
+        "+": "plus",
+        "=": "er lig med",
+        ">": "større end",
+        "<": "mindre end",
+        "°": "grader",
+        "°C": "grader celsius",
+        "°F": "grader fahrenheit",
+        "%": "procent",
+    },
+    currency_symbol_to_word={
+        "€": "euros",
+        "$": "dollars",
+        "£": "pund",
+        "¢": "cent",
+        "¥": "yen",
+        "kr": "kroner",
+    },
+    filler_words=[
+        # Post-diacritics forms: remove_diacritics runs before remove_filler_words,
+        # so ø→o and å→a conversions have already happened at match time.
+        "oh",  # from "øh"
+        "ohm",  # from "øhm"
+        "hm",
+        "hmm",
+        "mm",
+        "mhm",
+        "altsa",  # from "altså"
+        "na",  # from "nå"
+    ],
+    digit_words=_DANISH_DIGIT_WORDS,
+    number_words=[
+        *_DANISH_DIGIT_WORDS,
+        "ti",
+        "elleve",
+        "tolv",
+        "tretten",
+        "fjorten",
+        "femten",
+        "seksten",
+        "sytten",
+        "atten",
+        "nitten",
+        "tyve",
+        "tredive",
+        "tredve",
+        "fyrre",
+        "halvtreds",
+        "tres",
+        "halvfjerds",
+        "firs",
+        "halvfems",
+        "hundrede",
+        "tusind",
+        "million",
+        "millioner",
+        "milliard",
+        "milliarder",
+        "billion",
+        "billioner",
+    ],
+    plus_word="plus",
+)
+
+
+@register_language
+class DanishOperators(LanguageOperators):
+    def __init__(self) -> None:
+        super().__init__(DANISH_CONFIG)
+        self._number_normalizer = DanishNumberNormalizer(
+            DANISH_CONFIG.currency_symbol_to_word,
+        )
+
+    def expand_written_numbers(self, text: str) -> str:
+        """Convert Danish spelled-out numbers to digits (e.g. en og tyve → 21)."""
+        return self._number_normalizer(text)
+
+    def get_word_replacements(self) -> dict[str, str]:
+        from normalization.languages.danish.replacements import DANISH_REPLACEMENTS
+
+        return DANISH_REPLACEMENTS
diff --git a/normalization/languages/danish/replacements.py b/normalization/languages/danish/replacements.py
new file mode 100644
index 0000000..7536f2b
--- /dev/null
+++ b/normalization/languages/danish/replacements.py
@@ -0,0 +1,6 @@
+"""Single-token colloquial / spelling variants → standard Danish (canonical for WER)."""
+
+DANISH_REPLACEMENTS: dict[str, str] = {
+    "euro": "euros",
+    "krone": "kroner",
+}
diff --git a/tests/e2e/files/gladia-3/da.csv b/tests/e2e/files/gladia-3/da.csv
new file mode 100644
index 0000000..c334516
--- /dev/null
+++ b/tests/e2e/files/gladia-3/da.csv
@@ -0,0 +1,14 @@
+input,expected
+ti euro,10 euros
+2 < 5,2 mindre end 5
+50°C,50 grader celsius
+Det koster €50,det koster 50 euros
+en og tyve kroner,21 kroner
+10 kr,10 kroner
+øh hej,hej
+test@example.com,test snabel a example punktum com
+x = 5,x er lig med 5
+"1.234,56",1234 komma 56
+"3,14",3 komma 14
+en million,1000000
+ping pong,ping pong
diff --git a/tests/unit/languages/danish_number_normalizer_test.py b/tests/unit/languages/danish_number_normalizer_test.py
new file mode 100644
index 0000000..b8de6c6
--- /dev/null
+++ b/tests/unit/languages/danish_number_normalizer_test.py
@@ -0,0 +1,113 @@
+import pytest
+
+from normalization.languages.danish.number_normalizer import DanishNumberNormalizer
+from normalization.languages.danish.operators import DANISH_CONFIG
+
+
+@pytest.fixture
+def normalizer() -> DanishNumberNormalizer:
+    return DanishNumberNormalizer(DANISH_CONFIG.currency_symbol_to_word)
+
+
+@pytest.fixture
+def normalizer_no_currency() -> DanishNumberNormalizer:
+    return DanishNumberNormalizer(None)
+
+
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        # Basic digits and teens
+        ("ni", "9"),
+        ("elleve", "11"),
+        ("femten", "15"),
+        ("nitten", "19"),
+        # Vigesimal tens (unique to Danish)
+        ("tyve", "20"),
+        ("tredive", "30"),
+        ("tredve", "30"),  # colloquial variant
+        ("fyrre", "40"),
+        ("halvtreds", "50"),
+        ("tres", "60"),
+        ("halvfjerds", "70"),
+        ("firs", "80"),
+        ("halvfems", "90"),
+        # Three-word og-compounds
+        ("en og tyve", "21"),
+        ("to og tyve", "22"),
+        ("tre og halvtreds", "53"),
+        ("ni og halvfems", "99"),
+        ("fem og firs", "85"),
+        # Glued og-compounds
+        ("enogtyve", "21"),
+        ("toogtyve", "22"),
+        ("fireogfirs", "84"),
+        ("nioghalvfems", "99"),
+        ("treoghalvfjerds", "73"),
+        # Hundreds
+        ("hundrede", "100"),
+        ("to hundrede", "200"),
+        ("ni hundrede", "900"),
+        ("to hundrede og en", "201"),
+        ("tre hundrede og femten", "315"),
+        ("to hundrede og en og tyve", "221"),
+        ("to hundrede og fireoghalvtreds", "254"),
+        # Tusind
+        ("tusind", "1000"),
+        ("et tusind", "1000"),
+        ("to tusind", "2000"),
+        ("to tusind tre hundrede", "2300"),
+        ("tres tusind", "60000"),
+        # Large multipliers
+        ("en million", "1000000"),
+        ("to millioner", "2000000"),
+        ("en milliard", "1000000000"),
+        ("to milliarder", "2000000000"),
+        ("en billion", "1000000000000"),
+        # Mixed digit + word
+        ("3 milliard", "3000000000"),
+        ("5 million", "5000000"),
+        # Zero
+        ("nul", "0"),
+    ],
+)
+def test_danish_spelled_numbers(
+    normalizer: DanishNumberNormalizer, text: str, expected: str
+) -> None:
+    assert normalizer(text) == expected
+
+
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        ("ti euro", "10 euros"),
+        ("hundrede kroner", "100 kroner"),
+        ("€10", "10 euros"),
+        ("10 kr", "10 kroner"),
+        ("£50", "50 pund"),
+        ("fem dollar", "5 dollars"),
+    ],
+)
+def test_currency_symbols_and_plural_trailing_words(
+    normalizer: DanishNumberNormalizer, text: str, expected: str
+) -> None:
+    assert normalizer(text) == expected
+
+
+def test_without_currency_config_leaves_currency_symbol(
+    normalizer_no_currency: DanishNumberNormalizer,
+) -> None:
+    assert normalizer_no_currency("en og tyve") == "21"
+    assert normalizer_no_currency("€10") == "€10"
+    assert normalizer_no_currency("3 milliard") == "3000000000"
+
+
+def test_non_numeric_text_unchanged(normalizer: DanishNumberNormalizer) -> None:
+    text = "det her er almindelig tekst"
+    assert normalizer(text) == text
+
+
+def test_kroner_word_not_treated_as_currency_suffix(
+    normalizer: DanishNumberNormalizer,
+) -> None:
+    assert normalizer("25 kroner") == "25 kroner"
diff --git a/tests/unit/languages/danish_operators_test.py b/tests/unit/languages/danish_operators_test.py
new file mode 100644
index 0000000..d1e015b
--- /dev/null
+++ b/tests/unit/languages/danish_operators_test.py
@@ -0,0 +1,27 @@
+import pytest
+
+from normalization.languages.danish.operators import DanishOperators
+from normalization.languages.registry import get_language_registry
+
+
+@pytest.fixture
+def operators() -> DanishOperators:
+    return DanishOperators()
+
+
+def test_danish_is_registered() -> None:
+    assert "da" in get_language_registry()
+
+
+def test_danish_registry_produces_danish_operators() -> None:
+    instance = get_language_registry()["da"]()
+    assert isinstance(instance, DanishOperators)
+
+
+def test_config_code(operators: DanishOperators) -> None:
+    assert operators.config.code == "da"
+
+
+def test_word_replacements(operators: DanishOperators) -> None:
+    assert operators.get_word_replacements()["euro"] == "euros"
+    assert operators.get_word_replacements()["krone"] == "kroner"

From 8aa482c88de3925decefacd1aef082f13151a0cb Mon Sep 17 00:00:00 2001
From: karamouche <hugo4ibt@gmail.com>
Date: Wed, 27 May 2026 15:39:28 -0400
Subject: [PATCH 2/3] fix: update Danish currency terms to singular form in
 normalizer and tests

---
 .../languages/danish/number_normalizer.py         | 15 +--------------
 normalization/languages/danish/operators.py       |  4 ++--
 normalization/languages/danish/replacements.py    |  1 -
 tests/e2e/files/gladia-3/da.csv                   |  4 ++--
 .../languages/danish_number_normalizer_test.py    |  6 +++---
 tests/unit/languages/danish_operators_test.py     |  2 +-
 6 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/normalization/languages/danish/number_normalizer.py b/normalization/languages/danish/number_normalizer.py
index 9117417..c08325b 100644
--- a/normalization/languages/danish/number_normalizer.py
+++ b/normalization/languages/danish/number_normalizer.py
@@ -128,16 +128,6 @@ def replace(match: re.Match[str]) -> str:
 
 def _singular_spoken_unit(trailing_word: str) -> str:
     t = trailing_word.lower()
-    if t == "euros":
-        return "euro"
-    if t == "dollars":
-        return "dollar"
-    if t == "pund":
-        return "pund"  # invariant in Danish (same singular and plural)
-    if t == "cent":
-        return "cent"
-    if t == "yen":
-        return "yen"
     if t == "kroner":
         return "krone"
     return trailing_word
@@ -175,10 +165,7 @@ def _currency_plural_fix_patterns(
         singular = _singular_spoken_unit(trailing)
         if singular.lower() == tl:
             continue
-        if tl == "euros":
-            pat = re.compile(rf"\b{amount}\s+euro(?:'s)?\b", re.IGNORECASE)
-            out.append((pat, rf"\1 {trailing}"))
-        elif tl == "kroner":
+        if tl == "kroner":
             pat = re.compile(rf"\b{amount}\s+krone\b", re.IGNORECASE)
             out.append((pat, rf"\1 {trailing}"))
         else:
diff --git a/normalization/languages/danish/operators.py b/normalization/languages/danish/operators.py
index 7353a7c..f0f42d6 100644
--- a/normalization/languages/danish/operators.py
+++ b/normalization/languages/danish/operators.py
@@ -34,8 +34,8 @@
         "%": "procent",
     },
     currency_symbol_to_word={
-        "€": "euros",
-        "$": "dollars",
+        "€": "euro",
+        "$": "dollar",
         "£": "pund",
         "¢": "cent",
         "¥": "yen",
diff --git a/normalization/languages/danish/replacements.py b/normalization/languages/danish/replacements.py
index 7536f2b..658f9bd 100644
--- a/normalization/languages/danish/replacements.py
+++ b/normalization/languages/danish/replacements.py
@@ -1,6 +1,5 @@
 """Single-token colloquial / spelling variants → standard Danish (canonical for WER)."""
 
 DANISH_REPLACEMENTS: dict[str, str] = {
-    "euro": "euros",
     "krone": "kroner",
 }
diff --git a/tests/e2e/files/gladia-3/da.csv b/tests/e2e/files/gladia-3/da.csv
index c334516..de37d07 100644
--- a/tests/e2e/files/gladia-3/da.csv
+++ b/tests/e2e/files/gladia-3/da.csv
@@ -1,8 +1,8 @@
 input,expected
-ti euro,10 euros
+ti euro,10 euro
 2 < 5,2 mindre end 5
 50°C,50 grader celsius
-Det koster €50,det koster 50 euros
+Det koster €50,det koster 50 euro
 en og tyve kroner,21 kroner
 10 kr,10 kroner
 øh hej,hej
diff --git a/tests/unit/languages/danish_number_normalizer_test.py b/tests/unit/languages/danish_number_normalizer_test.py
index b8de6c6..845d9ed 100644
--- a/tests/unit/languages/danish_number_normalizer_test.py
+++ b/tests/unit/languages/danish_number_normalizer_test.py
@@ -80,12 +80,12 @@ def test_danish_spelled_numbers(
 @pytest.mark.parametrize(
     ("text", "expected"),
     [
-        ("ti euro", "10 euros"),
+        ("ti euro", "10 euro"),
         ("hundrede kroner", "100 kroner"),
-        ("€10", "10 euros"),
+        ("€10", "10 euro"),
         ("10 kr", "10 kroner"),
         ("£50", "50 pund"),
-        ("fem dollar", "5 dollars"),
+        ("fem dollar", "5 dollar"),
     ],
 )
 def test_currency_symbols_and_plural_trailing_words(
diff --git a/tests/unit/languages/danish_operators_test.py b/tests/unit/languages/danish_operators_test.py
index d1e015b..41d87ec 100644
--- a/tests/unit/languages/danish_operators_test.py
+++ b/tests/unit/languages/danish_operators_test.py
@@ -23,5 +23,5 @@ def test_config_code(operators: DanishOperators) -> None:
 
 
 def test_word_replacements(operators: DanishOperators) -> None:
-    assert operators.get_word_replacements()["euro"] == "euros"
     assert operators.get_word_replacements()["krone"] == "kroner"
+    assert "euro" not in operators.get_word_replacements()

From 3c2e0137622a6146c7bec87ff32c8d9a4ccc6a3e Mon Sep 17 00:00:00 2001
From: karamouche <hugo4ibt@gmail.com>
Date: Wed, 27 May 2026 16:16:49 -0400
Subject: [PATCH 3/3] fix: refine Danish filler words

---
 normalization/languages/danish/operators.py           | 4 +++-
 tests/unit/languages/danish_number_normalizer_test.py | 3 +++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/normalization/languages/danish/operators.py b/normalization/languages/danish/operators.py
index f0f42d6..dd2df4a 100644
--- a/normalization/languages/danish/operators.py
+++ b/normalization/languages/danish/operators.py
@@ -44,8 +44,10 @@
     filler_words=[
         # Post-diacritics forms: remove_diacritics runs before remove_filler_words,
         # so ø→o and å→a conversions have already happened at match time.
+        # "ohm" (from "øhm") is intentionally excluded: after diacritics removal it
+        # collides with the SI unit "ohm", which would be silently deleted in phrases
+        # like "modstanden er 10 ohm".
         "oh",  # from "øh"
-        "ohm",  # from "øhm"
         "hm",
         "hmm",
         "mm",
diff --git a/tests/unit/languages/danish_number_normalizer_test.py b/tests/unit/languages/danish_number_normalizer_test.py
index 845d9ed..a25c6c4 100644
--- a/tests/unit/languages/danish_number_normalizer_test.py
+++ b/tests/unit/languages/danish_number_normalizer_test.py
@@ -40,6 +40,7 @@ def normalizer_no_currency() -> DanishNumberNormalizer:
         ("fem og firs", "85"),
         # Glued og-compounds
         ("enogtyve", "21"),
+        ("etogtyve", "21"),  # neuter-"et" glued form
         ("toogtyve", "22"),
         ("fireogfirs", "84"),
         ("nioghalvfems", "99"),
@@ -55,6 +56,7 @@ def normalizer_no_currency() -> DanishNumberNormalizer:
         # Tusind
         ("tusind", "1000"),
         ("et tusind", "1000"),
+        ("en tusind", "1000"),  # common-gender form
         ("to tusind", "2000"),
         ("to tusind tre hundrede", "2300"),
         ("tres tusind", "60000"),
@@ -64,6 +66,7 @@ def normalizer_no_currency() -> DanishNumberNormalizer:
         ("en milliard", "1000000000"),
         ("to milliarder", "2000000000"),
         ("en billion", "1000000000000"),
+        ("to billioner", "2000000000000"),
         # Mixed digit + word
         ("3 milliard", "3000000000"),
         ("5 million", "5000000"),