diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 5f72de53c..000000000 --- a/.gitignore +++ /dev/null @@ -1,177 +0,0 @@ -# log and data files -*.model -*.pkl -#*.ipynb -output -result -*.pt -tests/data/asr -.DS_Store -bert.pt.json -work -runs -fastspeech_output -.hydra -.bash_history.local - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class -**.pyc - -# C extensions -*.so - -# Distribution / packaging -.idea -.Python -wandb -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -#parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/build - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don’t work, or not -# install all needed dependencies. 
-#Pipfile.lock - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# VSCode project settins -.vscode/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site -/docs/html -/docs/docs_zh/zh - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# Emacs backup files -*~ - -cifar-10-batches-py -*.tar.gz - -# Test data. -tests/.data -tests/data - -# outputs folder -examples/*/outputs -examples/*/NeMo_experiments -examples/*/nemo_experiments -examples/*/.hydra -examples/*/wandb -examples/*/data -wandb -dump.py - -docs/sources/source/test_build/ - -# Checkpoints, config files and temporary files created in tutorials. -examples/neural_graphs/*.chkpt -examples/neural_graphs/*.yml - -.hydra/ -nemo_experiments/ -*.swp diff --git a/Jenkinsfile b/Jenkinsfile index 51ce37a10..253af49c2 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -28,6 +28,7 @@ pipeline { MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-22-25-0' + KO_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-25-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { @@ -318,6 +319,22 @@ pipeline { } } } + stage('L0: Create KO TN Grammars') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + parallel { + stage('L0: KO TN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=ko --text="100" --cache_dir ${KO_TN_CACHE}' + } + } + } + } // L1 Tests starts here @@ -406,6 +423,11 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/hy/ -m "not pleasefixme" --cpu --tn_cache_dir 
${HY_TN_CACHE}' } } + stage('L1: Run all KO TN/ITN tests (restore grammars from cache)') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/ko/ -m "not pleasefixme" --cpu --tn_cache_dir ${KO_TN_CACHE}' + } + } } } diff --git a/nemo_text_processing/text_normalization/ko/__init__.py b/nemo_text_processing/text_normalization/ko/__init__.py new file mode 100644 index 000000000..dd0e509b3 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.text_normalization.en.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.text_normalization.en.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.text_normalization.en.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/text_normalization/ko/data/__init__.py b/nemo_text_processing/text_normalization/ko/data/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/ko/data/date/__init__.py b/nemo_text_processing/text_normalization/ko/data/date/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/date/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/ko/data/date/exceptions.tsv b/nemo_text_processing/text_normalization/ko/data/date/exceptions.tsv new file mode 100644 index 000000000..2f54cee92 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/date/exceptions.tsv @@ -0,0 +1,2 @@ +6 유 +10 시 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/date/week.tsv b/nemo_text_processing/text_normalization/ko/data/date/week.tsv new file mode 100644 index 000000000..bc205bc3f --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/date/week.tsv @@ -0,0 +1,8 @@ +월 월요일 +화 화요일 +수 수요일 +목 목요일 +금 금요일 +토 토요일 +일 일요일 +공 공휴일 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/__init__.py b/nemo_text_processing/text_normalization/ko/data/electronic/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/electronic/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv new file mode 100644 index 000000000..f562cfbad --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv @@ -0,0 +1,11 @@ +카드 끝자리 카드 끝자리 +카드 마지막 네자리 카드 마지막 네자리 +카드 마지막 4자리 카드 마지막 네자리 +신용카드 번호 신용카드 번호 +신용카드 신용카드 +체크카드 번호 체크카드 번호 +체크카드 체크카드 +카드번호 카드번호 +결제 카드 결제 카드 +결제카드 결제카드 +카드 카드 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv new file mode 100644 index 000000000..3d04ca298 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv @@ -0,0 +1,28 @@ +.com 닷컴 +.org 닷 오알지 +.gov 닷 거브 +.edu 닷 에듀 +.net 닷 넷 +.ai 닷 에이아이 +.io 닷 아이오 +.dev 닷 데브 +.app 닷 앱 +.cloud 닷 클라우드 +.shop 닷 샵 +.store 닷 스토어 +.co 닷 씨오 +.me 닷 미 +.kr 닷 케이알 +.co.kr 닷 씨오 닷 케이알 +.ac.kr 닷 에이씨 닷 케이알 +.or.kr 닷 오알 닷 케이알 +.go.kr 닷 지오 닷 케이알 +.re.kr 닷 알이 닷 케이알 +.cn 닷 씨엔 +.fr 닷 에프알 +.de 닷 디이 +.it 닷 아이티 +.uk 닷 유케이 +.br 닷 비알 +.in 닷 아이엔 +.ru 닷 알유 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv new file mode 100644 index 000000000..c80d08a69 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv @@ -0,0 +1,6 @@ +.jpg 닷 제이피지 +.png 닷 피엔지 +.pdf 닷 피디에프 +.JPG 닷 제이피지 +.PNG 닷 피엔지 +.PDF 닷 피디에프 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/symbol.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/symbol.tsv new file mode 100644 index 000000000..f551dabf4 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/electronic/symbol.tsv @@ -0,0 +1,12 @@ +@ 골뱅이 +. 점 +/ 슬래시 +- 대시 +_ 언더바 +: 콜론 +? 
물음표 += 이퀄 +& 앰퍼샌드 +% 퍼센트 ++ 플러스 +# 샵 diff --git a/nemo_text_processing/text_normalization/ko/data/measure/__init__.py b/nemo_text_processing/text_normalization/ko/data/measure/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/measure/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/ko/data/measure/unit.tsv b/nemo_text_processing/text_normalization/ko/data/measure/unit.tsv new file mode 100644 index 000000000..ccec41e7f --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/measure/unit.tsv @@ -0,0 +1,18 @@ +kg 킬로그램 +g 그램 +km 킬로미터 +m 미터 +cm 센티미터 +mm 밀리미터 +L 리터 +l 리터 +mL 밀리리터 +ml 밀리리터 +h 시간 +s 초 +N 뉴턴 +W 와트 +Hz 헤르츠 +° 도 +% 퍼센트 +rpm 분당회전수 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/money/__init__.py b/nemo_text_processing/text_normalization/ko/data/money/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/money/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/ko/data/money/currency_major.tsv b/nemo_text_processing/text_normalization/ko/data/money/currency_major.tsv new file mode 100644 index 000000000..a5ddfeb45 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/money/currency_major.tsv @@ -0,0 +1,22 @@ +₩ 원 +KRW 원 +krw 원 +$ 달러 +US$ 달러 +HK$ 홍콩 달러 +hk$ 홍콩 달러 +€ 유로 +EUR 유로 +¥ 엔 +JPY 엔 +CAD 캐나다 달러 +cad 캐나다 달러 +NZD 뉴질랜드 달러 +nzd 뉴질랜드 달러 +CHF 스위스 프랑 +chf 스위스 프랑 +AED 아랍에미리트 디르함 +aed 아랍에미리트 디르함 +Dh 디르함 +DH 디르함 +Dhs. 디르함 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/number/__init__.py b/nemo_text_processing/text_normalization/ko/data/number/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/number/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/ko/data/number/digit.tsv b/nemo_text_processing/text_normalization/ko/data/number/digit.tsv new file mode 100644 index 000000000..61a7dddcf --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/number/digit.tsv @@ -0,0 +1,9 @@ +1 일 +2 이 +3 삼 +4 사 +5 오 +6 육 +7 칠 +8 팔 +9 구 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/number/tens.tsv b/nemo_text_processing/text_normalization/ko/data/number/tens.tsv new file mode 100644 index 000000000..d8b8e0a2b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/number/tens.tsv @@ -0,0 +1,9 @@ +1 십 +2 이십 +3 삼십 +4 사십 +5 오십 +6 육십 +7 칠십 +8 팔십 +9 구십 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/number/zero.tsv b/nemo_text_processing/text_normalization/ko/data/number/zero.tsv new file mode 100644 index 000000000..7024c0534 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/number/zero.tsv @@ -0,0 +1 @@ +0 영 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/ordinal/__init__.py b/nemo_text_processing/text_normalization/ko/data/ordinal/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/ordinal/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/ko/data/ordinal/digit.tsv b/nemo_text_processing/text_normalization/ko/data/ordinal/digit.tsv new file mode 100644 index 000000000..b3efc4cef --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/ordinal/digit.tsv @@ -0,0 +1,8 @@ +2 두 +3 세 +4 네 +5 다섯 +6 여섯 +7 일곱 +8 여덟 +9 아홉 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/ordinal/exceptions.tsv b/nemo_text_processing/text_normalization/ko/data/ordinal/exceptions.tsv new file mode 100644 index 000000000..ad796a0e8 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/ordinal/exceptions.tsv @@ -0,0 +1,5 @@ +1 첫 +11 열한 +20 스무 +21 스물한 +31 서른한 diff --git a/nemo_text_processing/text_normalization/ko/data/ordinal/tens.tsv b/nemo_text_processing/text_normalization/ko/data/ordinal/tens.tsv new file mode 100644 index 000000000..994a21a1d --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/ordinal/tens.tsv @@ -0,0 +1,2 @@ +10 열 +30 서른 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/ordinal/tens_prefix.tsv b/nemo_text_processing/text_normalization/ko/data/ordinal/tens_prefix.tsv new file mode 100644 index 000000000..3111a82dc --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/ordinal/tens_prefix.tsv @@ -0,0 +1,3 @@ +1 열 +2 스물 +3 서른 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/time/__init__.py b/nemo_text_processing/text_normalization/ko/data/time/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/time/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/ko/data/time/division.tsv b/nemo_text_processing/text_normalization/ko/data/time/division.tsv new file mode 100644 index 000000000..9250d0a8f --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/time/division.tsv @@ -0,0 +1,7 @@ +오전 +오후 +새벽 +아침 +낮 +저녁 +밤 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/time/hour.tsv b/nemo_text_processing/text_normalization/ko/data/time/hour.tsv new file mode 100644 index 000000000..abfccd310 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/time/hour.tsv @@ -0,0 +1,12 @@ +1 한 +2 두 +3 세 +4 네 +5 다섯 +6 여섯 +7 일곱 +8 여덟 +9 아홉 +10 열 +11 열한 +12 열두 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/whitelist.tsv b/nemo_text_processing/text_normalization/ko/data/whitelist.tsv new file mode 100644 index 000000000..82dc1220e --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/whitelist.tsv @@ -0,0 +1,38 @@ +Dr. 박사 +dr. 박사 +Mr. 씨 +mr. 씨 +Ms. 양 +ms. 양 +Mrs. 여사 +mrs. 여사 +mt. 산 +Mt. 산 +Prof. 교수 +prof. 교수 +sr. 시니어 +Sr. 시니어 +jr. 주니어 +Jr. 주니어 +rd. 로 +Rd. 로 +Ave. 대로 +ave. 대로 +no. 번호 +No. 번호 +( 왼쪽 괄호 +) 오른쪽 괄호 ++ 더하기 +- 마이너스 +Σ 시그마 +η 에타 +κ 카파 +ω 오메가 +σ 시그마 +α 알파 +ν 뉴 +δ 델타 +ι 이오타 +vs. 대 +Ph.D. 박사학위 +etc. 
등
diff --git a/nemo_text_processing/text_normalization/ko/graph_utils.py b/nemo_text_processing/text_normalization/ko/graph_utils.py
new file mode 100644
index 000000000..9db51238f
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/graph_utils.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright 2015 and onwards Google, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import string
+from pathlib import Path
+from typing import Dict
+
+import pynini
+from pynini import Far
+from pynini.export import export
+from pynini.lib import byte, pynutil, utf8
+
+from nemo_text_processing.text_normalization.en.utils import load_labels
+from nemo_text_processing.utils.logging import logger
+
+NEMO_CHAR = utf8.VALID_UTF8_CHAR
+
+NEMO_DIGIT = byte.DIGIT
+NEMO_ALPHA = pynini.union(*[chr(i) for i in range(ord('가'), ord('힣') + 1)]).optimize()  # '가'..'힣' only
+NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()  # digits plus NEMO_ALPHA; no Latin letters
+NEMO_HEX = pynini.union(*string.hexdigits).optimize()
+NEMO_SPACE = " "
+NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize()  # includes the non-breaking space
+NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
+NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()  # any single character except a double quote
+
+NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
+NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()
+
+NEMO_SIGMA = pynini.closure(NEMO_CHAR)  # any string of valid UTF-8 characters, including the empty one
+
+delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))  # drops zero or more whitespace characters
+delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1))
+insert_space = pynutil.insert(" ")
+delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")  # one or more whitespace -> one space
+delete_preserve_order = pynini.closure(
+    pynutil.delete(" preserve_order: true")
+    | (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"'))
+)  # NOTE(review): NEMO_NOT_QUOTE is one character and is passed through, not deleted - confirm intent
+
+
+# Common string literals shared by the grammars; extend as needed.
+username_string = "username"
+double_quotes = '"'
+domain_string = "domain"
+protocol_string = "protocol"
+slash = "/"
+double_slash = "//"
+triple_slash = "///"
+file = "file"
+period = "."
+at = "@"
+colon = ":"
+https = "https"
+http = "http"
+www = "www"
+
+
+def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]) -> None:
+    """
+    Writes every graph, after optimize(), into one FAR file keyed by its rule name, then logs the file name.
+
+    Args:
+        file_name: path of the FAR (OpenFst finite state archive) file to create
+        graphs: mapping from rule name to the Pynini WFST graph stored under that name
+    """
+    exporter = export.Exporter(file_name)
+    for rule, graph in graphs.items():
+        exporter[rule] = graph.optimize()
+    exporter.close()
+    logger.info(f"Created {file_name}")
+
+
+def convert_space(fst) -> "pynini.FstLike":
+    """
+    Composes fst with a rewrite that turns every plain space into a non-breaking space (U+00A0).
+    Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty"
+    This makes the transducer significantly slower, so only use it when spaces can occur within quotes.
+
+    Args:
+        fst: input fst
+
+    Returns: the composed fst, in which breaking spaces are converted to non-breaking spaces
+    """
+    return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, "\u00a0"), "", "", NEMO_SIGMA)
+
+
+def string_map_cased(input_file: str, input_case: str = "lower_cased"):  # NOTE: input_case is never read
+    labels = load_labels(input_file)  # rows come from the en-package load_labels helper
+    whitelist = pynini.string_map(labels).invert().optimize()  # map built from labels, then inverted
+    return whitelist
+
+
+class GraphFst:
+    """
+    Base class for all grammar fsts; on construction it loads grammars/<kind>/<name>.far if that file exists.
+
+    Args:
+        name: name of grammar class
+        kind: either 'classify' or 'verbalize'
+        deterministic: if True will provide a single transduction option,
+            for False multiple transduction are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, name: str, kind: str, deterministic: bool = True):
+        self.name = name
+        self.kind = kind
+        self._fst = None  # stays None unless a FAR file is found below
+        self.deterministic = deterministic
+
+        self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far")
+        if self.far_exist():  # the path is resolved relative to this module's directory
+            self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()
+
+    def far_exist(self) -> bool:
+        """
+        Returns True if a file exists at self.far_path (the file is not opened or validated here)
+        """
+        return self.far_path.exists()
+
+    @property
+    def fst(self) -> "pynini.FstLike":
+        return self._fst  # None when no FAR was loaded and no fst has been assigned yet
+
+    @fst.setter
+    def fst(self, fst):
+        self._fst = fst  # replaces whatever was loaded or assigned before
+
+    def add_tokens(self, fst) -> "pynini.FstLike":
+        """
+        Wraps the output of the given fst as "name { ... }", where name is self.name
+
+        Args:
+            fst: input fst
+
+        Returns:
+            Fst: insertion of "name { ", followed by fst, followed by insertion of " }"
+        """
+        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")
+
+    def delete_tokens(self, fst) -> "pynini.FstLike":
+        """
+        Deletes the "name { ... }" wrap around the given fst, then maps non-breaking spaces back to plain spaces
+
+        Args:
+            fst: input fst matching what sits between the braces
+
+        Returns:
+            Fst: the unwrapping fst composed with the U+00A0 -> " " rewrite
+        """
+        res = (
+            pynutil.delete(f"{self.name}")
+            + delete_space
+            + pynutil.delete("{")
+            + delete_space
+            + fst
+            + delete_space
+            + pynutil.delete("}")
+        )
+        return res @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA)
diff --git a/nemo_text_processing/text_normalization/ko/taggers/__init__.py b/nemo_text_processing/text_normalization/ko/taggers/__init__.py
new file mode 100644
index 000000000..341a77c5b
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/taggers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py new file mode 100644 index 000000000..84f389b9d --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py @@ -0,0 +1,301 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, NEMO_SIGMA, NEMO_SPACE, GraphFst +from nemo_text_processing.text_normalization.ko.utils import get_abs_path + + +class CardinalFst(GraphFst): + def __init__(self, deterministic: bool = True): + super().__init__(name="cardinal", kind="classify", deterministic=deterministic) + + # Optional small whitespace inside parentheses or after signs + ws = pynini.closure(NEMO_SPACE, 0, 2) + + # Load base .tsv files + graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) + graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv")) + + digit_except_one = pynini.difference(NEMO_DIGIT, "1") + digit_except_zero_one = pynini.difference(digit_except_one, "0") + + graph_digit_no_zero_one = digit_except_zero_one @ graph_digit + graph_tens = pynini.string_file(get_abs_path("data/number/tens.tsv")) + + # Compose all basic number forms + graph_1_to_99 = (graph_tens + (graph_digit | pynutil.delete('0'))) | graph_digit + + hundreds = NEMO_DIGIT**3 + graph_hundred_component = ( + pynini.cross('1', '백') | (graph_digit_no_zero_one + pynutil.insert('백')) + ) + pynini.union(pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0')) + graph_1_to_99)) + graph_hundred = hundreds @ graph_hundred_component + + thousands = NEMO_DIGIT**4 + graph_thousand_component = pynini.union( + pynini.cross('1', '천'), + graph_digit_no_zero_one + pynutil.insert('천'), + ) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_hundred_component, + (pynini.closure(pynutil.delete('0')) + graph_1_to_99), + ) + graph_thousand = thousands @ graph_thousand_component + + ten_thousands = NEMO_DIGIT**5 + graph_ten_thousand_component = pynini.union( + pynini.cross('1', '만'), + graph_digit_no_zero_one + pynutil.insert('만'), + ) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_thousand_component, + (pynutil.delete('0') + 
graph_hundred_component), + (pynini.closure(pynutil.delete('0')) + graph_1_to_99), + ) + graph_ten_thousand = ten_thousands @ graph_ten_thousand_component + + hundred_thousands = NEMO_DIGIT**6 + graph_hundred_thousand_component = ((NEMO_DIGIT**2 @ graph_1_to_99) + pynutil.insert("만")) + pynini.union( + pynini.closure(pynutil.delete("0")), + graph_thousand_component, + (pynutil.delete("0") + graph_hundred_component), + (pynini.closure(pynutil.delete("0")) + graph_1_to_99), + ) + graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component + + millions = NEMO_DIGIT**7 + graph_million_component = ((graph_hundred) + pynutil.insert('만')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_thousand_component, + (pynutil.delete('0') + graph_hundred_component), + (pynini.closure(pynutil.delete('0')) + graph_1_to_99), + ) + graph_million = millions @ graph_million_component + + ten_millions = NEMO_DIGIT**8 + graph_ten_million_component = ((graph_thousand) + pynutil.insert('만')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_thousand_component, + (pynutil.delete('0') + graph_hundred_component), + (pynini.closure(pynutil.delete('0')) + graph_1_to_99), + ) + graph_ten_million = ten_millions @ graph_ten_million_component + + hundred_millions = NEMO_DIGIT**9 + graph_hundred_million_component = (graph_digit + pynutil.insert('억')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_million_component, + (pynutil.delete('0') + graph_million_component), + (pynutil.delete('00') + graph_hundred_thousand_component), + (pynutil.delete('000') + graph_ten_thousand_component), + (pynutil.delete('0000') + graph_thousand_component), + ((pynutil.delete('00000') + graph_hundred_component)), + (pynini.closure(pynutil.delete('0')) + graph_1_to_99), + ) + graph_hundred_million = hundred_millions @ graph_hundred_million_component + + thousand_millions = NEMO_DIGIT**10 + graph_thousand_million_component = ((NEMO_DIGIT**2 @ graph_1_to_99) + 
pynutil.insert('억')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_million_component, + (pynutil.delete('0') + graph_million_component), + (pynutil.delete('00') + graph_hundred_thousand_component), + (pynutil.delete('000') + graph_ten_thousand_component), + (pynutil.delete('0000') + graph_thousand_component), + ((pynutil.delete('00000') + graph_hundred_component)), + (pynini.closure(pynutil.delete('0')) + graph_1_to_99), + ) + graph_thousand_million = thousand_millions @ graph_thousand_million_component + + billions = NEMO_DIGIT**11 + graph_billions_component = ((graph_hundred) + pynutil.insert('억')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_million_component, + (pynutil.delete('0') + graph_million_component), + (pynutil.delete('00') + graph_hundred_thousand_component), + (pynutil.delete('000') + graph_ten_thousand_component), + (pynutil.delete('0000') + graph_thousand_component), + ((pynutil.delete('00000') + graph_hundred_component)), + (pynini.closure(pynutil.delete('0')) + graph_1_to_99), + ) + graph_billions = billions @ graph_billions_component + + ten_billions = NEMO_DIGIT**12 + graph_ten_billions_component = ((graph_thousand) + pynutil.insert('억')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_million_component, + (pynutil.delete('0') + graph_million_component), + (pynutil.delete('00') + graph_hundred_thousand_component), + (pynutil.delete('000') + graph_ten_thousand_component), + (pynutil.delete('0000') + graph_thousand_component), + ((pynutil.delete('00000') + graph_hundred_component)), + (pynini.closure(pynutil.delete('0')) + graph_1_to_99), + ) + graph_ten_billions = ten_billions @ graph_ten_billions_component + + hundred_billions = NEMO_DIGIT**13 + graph_hundred_billions_component = (graph_digit + pynutil.insert('조')) + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_billions_component, + pynutil.delete('0') + graph_billions_component, + pynutil.delete('00') + 
graph_thousand_million_component, + pynutil.delete('000') + graph_hundred_million_component, + pynutil.delete('0000') + graph_ten_million_component, + pynutil.delete('00000') + graph_million_component, + pynutil.delete('000000') + graph_hundred_thousand_component, + pynutil.delete('0000000') + graph_ten_thousand_component, + pynutil.delete('00000000') + graph_thousand_component, + pynutil.delete('000000000') + graph_hundred_component, + (pynini.closure(pynutil.delete('0')) + graph_1_to_99), + ) + graph_hundred_billions = hundred_billions @ graph_hundred_billions_component + + trillion = NEMO_DIGIT**14 + graph_trillion_component = ( + (NEMO_DIGIT**2 @ graph_1_to_99) + + pynutil.insert('조') + + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_billions_component, + pynutil.delete('0') + graph_billions_component, + pynutil.delete('00') + graph_thousand_million_component, + pynutil.delete('000') + graph_hundred_million_component, + pynutil.delete('0000') + graph_ten_million_component, + pynutil.delete('00000') + graph_million_component, + pynutil.delete('000000') + graph_hundred_thousand_component, + pynutil.delete('0000000') + graph_ten_thousand_component, + pynutil.delete('00000000') + graph_thousand_component, + pynutil.delete('000000000') + graph_hundred_component, + (pynini.closure(pynutil.delete('0')) + graph_1_to_99), + ) + ) + graph_trillions = trillion @ graph_trillion_component + + ten_trillions = NEMO_DIGIT**15 + graph_ten_trillions_component = ( + (graph_hundred) + + pynutil.insert('조') + + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_billions_component, + pynutil.delete('0') + graph_billions_component, + pynutil.delete('00') + graph_thousand_million_component, + pynutil.delete('000') + graph_hundred_million_component, + pynutil.delete('0000') + graph_ten_million_component, + pynutil.delete('00000') + graph_million_component, + pynutil.delete('000000') + graph_hundred_thousand_component, + pynutil.delete('0000000') + 
graph_ten_thousand_component, + pynutil.delete('00000000') + graph_thousand_component, + pynutil.delete('000000000') + graph_hundred_component, + (pynini.closure(pynutil.delete('0')) + graph_1_to_99), + ) + ) + graph_ten_trillions = ten_trillions @ graph_ten_trillions_component + + hundred_trillions = NEMO_DIGIT**16 + graph_hundred_trillions_component = ( + (graph_thousand) + + pynutil.insert('조') + + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_ten_billions_component, + pynutil.delete('0') + graph_billions_component, + pynutil.delete('00') + graph_thousand_million_component, + pynutil.delete('000') + graph_hundred_million_component, + pynutil.delete('0000') + graph_ten_million_component, + pynutil.delete('00000') + graph_million_component, + pynutil.delete('000000') + graph_hundred_thousand_component, + pynutil.delete('0000000') + graph_ten_thousand_component, + pynutil.delete('00000000') + graph_thousand_component, + pynutil.delete('000000000') + graph_hundred_component, + (pynini.closure(pynutil.delete('0')) + graph_1_to_99), + ) + ) + graph_hundred_trillions = hundred_trillions @ graph_hundred_trillions_component + + thousand_trillions = NEMO_DIGIT**17 + graph_thousand_trillions_component = ( + graph_digit + + pynutil.insert('경') + + pynini.union( + pynini.closure(pynutil.delete('0')), + graph_hundred_trillions_component, + pynutil.delete('0') + graph_ten_trillions_component, + pynutil.delete('00') + graph_trillion_component, + pynutil.delete('000') + graph_hundred_billions_component, + pynutil.delete('0000') + graph_ten_billions_component, + pynutil.delete('00000') + graph_billions_component, + pynutil.delete('000000') + graph_thousand_million_component, + pynutil.delete('0000000') + graph_hundred_million_component, + pynutil.delete('00000000') + graph_ten_million_component, + pynutil.delete('000000000') + graph_million_component, + pynutil.delete('0000000000') + graph_hundred_thousand_component, + pynutil.delete('00000000000') + 
graph_ten_thousand_component, + pynutil.delete('000000000000') + graph_thousand_component, + pynutil.delete('0000000000000') + graph_hundred_component, + (pynini.closure(pynutil.delete('0')) + graph_1_to_99), + ) + ) + graph_thousand_trillions = thousand_trillions @ graph_thousand_trillions_component + + # FST + graph_num = pynini.union( + graph_thousand_trillions, + graph_hundred_trillions, + graph_ten_trillions, + graph_trillions, + graph_hundred_billions, + graph_ten_billions, + graph_billions, + graph_thousand_million, + graph_hundred_million, + graph_ten_million, + graph_million, + graph_hundred_thousand, + graph_ten_thousand, + graph_thousand, + graph_hundred, + graph_1_to_99, + graph_zero, + ).optimize() + + # Sign and final formatting + # Build the integer token (integer: "...") + integer_token = pynutil.insert('integer: "') + graph_num + pynutil.insert('"') + + # Sign handling: + # - minus sets negative flag + # - plus is ignored (positive number) + minus_prefix = pynutil.insert('negative: "true" ') + pynutil.delete("-") + plus_prefix = pynutil.delete("+") + + # Accounting negative: "( 1,234 )" -> negative + integer:"1234" + paren_negative = ( + pynutil.insert('negative: "true" ') + pynutil.delete("(") + ws + integer_token + ws + pynutil.delete(")") + ) + + # Signed number: optional (+|-) + integer + signed_integer = (minus_prefix | plus_prefix).ques + integer_token + + # Prefer accounting-form first, then signed form + final_graph = paren_negative | signed_integer + + # Wrap with class tokens and finalize + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() + self.graph = graph_num diff --git a/nemo_text_processing/text_normalization/ko/taggers/date.py b/nemo_text_processing/text_normalization/ko/taggers/date.py new file mode 100644 index 000000000..4f2da5702 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/date.py @@ -0,0 +1,311 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, delete_space, insert_space +from nemo_text_processing.text_normalization.ko.utils import get_abs_path + + +class DateFst(GraphFst): + """ + Finite state transducer for classifying dates in Korean, e.g. + 2024/01/30 -> date { year: "이천이십사" month: "일월" day: "삼십" } + 2024/1/30 -> date { year: "이천이십사" month: "일월" day: "삼십" } + 2024-01-30 -> date { year: "이천이십사" month: "일월" day: "삼십" } + 2024.01.30 -> date { year: "이천이십사" month: "일월" day: "삼십" } + + 기원전233년 -> date { era: "기원전" year: "이백삼십삼년" } + 기원후2024년 -> date { era: "기원후" year: "이천이십사년" } + + 21일월요일 -> tokens { date { day: "이십일일" weekday: "월요일" } } + 1970년대 -> date { year: "천구백칠십년대" } + + 1월1일(월)~3일(수) + -> tokens { date { month: "일월" day: "일일" weekday: "월요일" } } + tokens { name: "부터" } + tokens { date { day: "삼일" weekday: "수요일" } } + + 1970~1980년대 + -> tokens { cardinal { integer: "천구백칠십" } } + tokens { name: "부터" } + tokens { date { year: "천구백팔십년대" } } + + 7월5~9일(월~금) + -> tokens { date { month: "칠월" } } + tokens { cardinal { integer: "오" } } + tokens { name: "부터" } + tokens { date { day: "구일" weekday: "월요일" } } + tokens { name: "부터" } + tokens { date { weekday: "금요일" } } + + 2023년3월1일(수)~6월12일(화) + -> tokens { date { year: "이천이십삼년" month: "삼월" day: "일일" weekday: "수요일" } } + tokens { name: "부터" } + tokens { date { month: "유월" day: "십이일" 
weekday: "화요일" } } + + Args: + cardinal: CardinalFst + """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="date", kind="classify", deterministic=deterministic) + + strip0 = pynini.closure(pynutil.delete("0"), 0, 1) + graph_cardinal = cardinal.graph + cardinal_lz = (strip0 + graph_cardinal).optimize() + + # Load base .tsv files + week = pynini.string_file(get_abs_path("data/date/week.tsv")) + month_exceptions = pynini.string_file(get_abs_path("data/date/exceptions.tsv")) + month_exceptions_inputs = pynini.project(month_exceptions, "input").optimize() + + # Non-exception inputs go through the generic cardinal path + graph_cardinal_non_exceptions = pynini.compose( + pynini.difference(pynini.project(graph_cardinal, "input"), month_exceptions_inputs).optimize(), + graph_cardinal, + ).optimize() + + # Month cardinal: inputs listed in exceptions.tsv use the exception reading; all other inputs use the generic cardinal + month_cardinal = strip0 + (month_exceptions | graph_cardinal_non_exceptions).optimize() + + era = pynini.union("기원전", "기원후").optimize() + signs = pynutil.delete("/") | pynutil.delete(".") | pynutil.delete("-") + + # Strict digit ranges for M/D/Y and Y/M/D + _d = pynini.union(*[pynini.accep(str(i)) for i in range(10)]) + _1to9 = pynini.union(*[pynini.accep(str(i)) for i in range(1, 10)]) + + # For standalone years: + # - No era: 1–4 digits; NOTE(review): up to three leading zeros are also deleted here (same pattern as the era form) + YEAR_NO_ERA_1TO4 = pynini.closure(pynutil.delete("0"), 0, 3) + _1to9 + pynini.closure(_d, 0, 3) + # - With era (기원전/기원후): allow leading zeros but strip them + YEAR_ERA_1TO4 = pynini.closure(pynutil.delete("0"), 0, 3) + _1to9 + pynini.closure(_d, 0, 3) + + # MM: 01-09 | 10-12 + MM = (pynini.accep("0") + _1to9) | (pynini.accep("1") + pynini.union("0", "1", "2")) + + # DD: 01-09 | 10-19 | 20-29 | 30-31 + DD = ( + (pynini.accep("0") + _1to9) + | (pynini.accep("1") + _d) + | (pynini.accep("2") + _d) + | (pynini.accep("3") + pynini.union("0", "1")) + ) + + # YYYY: exactly 4 digits starting with 1 or 2; YY: two-digit year for M/D/YY and D/M/YY + YYYY = 
pynini.union("1", "2") + _d + _d + _d + YY = _d + _d + + # Map digits -> cardinal words using existing graphs (strip leading zero via month_cardinal/cardinal_lz) + mm_to_text = pynini.compose(MM, month_cardinal).optimize() + dd_to_text = pynini.compose(DD, cardinal_lz).optimize() + yy_to_text = pynini.compose(YY, graph_cardinal).optimize() + + # Components with tags/suffixes (strict) + month_component_md = ( + pynutil.insert("month: \"") + mm_to_text + pynutil.insert("월") + pynutil.insert("\"") + ).optimize() + day_component_md = ( + pynutil.insert("day: \"") + dd_to_text + pynutil.insert("일") + pynutil.insert("\"") + ).optimize() + year_component_y2 = ( + pynutil.insert("year: \"") + yy_to_text + pynutil.insert("년") + pynutil.insert("\"") + ).optimize() + + # Generic components + era_component = pynutil.insert("era: \"") + era + pynutil.insert("\"") + + # Brackets for weekday + front_bracket = ( + pynini.closure(pynutil.delete(delete_space)) + + pynutil.delete("(") + + pynini.closure(pynutil.delete(delete_space)) + ) | ( + pynini.closure(pynutil.delete(delete_space)) + + pynutil.delete("(") + + pynini.closure(pynutil.delete(delete_space)) + ) + preceding_bracket = ( + pynini.closure(pynutil.delete(delete_space)) + + pynutil.delete(")") + + pynini.closure(pynutil.delete(delete_space)) + ) | ( + pynini.closure(pynutil.delete(delete_space)) + + pynutil.delete(")") + + pynini.closure(pynutil.delete(delete_space)) + ) + + week_component_bracketed = ( + (front_bracket + pynutil.insert("weekday: \"") + week + preceding_bracket + pynutil.insert("\"")) + | ( + front_bracket + + pynutil.insert("weekday: \"") + + week + + pynini.cross("〜", "부터") + + week + + preceding_bracket + + pynutil.insert("\"") + ) + | ( + front_bracket + + pynutil.insert("weekday: \"") + + week + + pynutil.delete("・") + + week + + preceding_bracket + + pynutil.insert("\"") + ) + ) + + week_component_plain = pynutil.insert("weekday: \"") + week + pynutil.insert("\"") + week_component = 
week_component_bracketed | week_component_plain + + # Strict 4-digit year component (1000–2999) + year_component_y4_strict = ( + pynutil.insert("year: \"") + (YYYY @ graph_cardinal) + pynutil.insert("년") + pynutil.insert("\"") + ).optimize() + + # Prefer strict 4-digit; still allow 2-digit with worse weight (for MM/DD/YY etc.) + year_component_md_strict = (year_component_y4_strict | pynutil.add_weight(year_component_y2, 1.0)).optimize() + + # Format: YYYY/MM/DD(weekday) + graph_basic_date = ( + pynini.closure(era_component + insert_space, 0, 1) + + year_component_y4_strict + + signs + + insert_space + + (pynutil.insert("month: \"") + month_cardinal + pynutil.insert("월") + pynutil.insert("\"")) + + signs + + insert_space + + (pynutil.insert("day: \"") + cardinal_lz + pynutil.insert("일") + pynutil.insert("\"")) + + pynini.closure(pynini.closure(insert_space, 0, 1) + week_component, 0, 1) + ) + + # American: MM/DD/YYYY (two-digit YY also accepted, at a worse weight) + graph_american_date = ( + month_component_md + + signs + + insert_space + + day_component_md + + signs + + insert_space + + year_component_md_strict + + pynini.closure(pynini.closure(insert_space, 0, 1) + week_component, 0, 1) + ).optimize() + + # European: DD/MM/YYYY (two-digit YY also accepted, at a worse weight) + graph_european_date = ( + day_component_md + + signs + + insert_space + + month_component_md + + signs + + insert_space + + year_component_md_strict + + pynini.closure(pynini.closure(insert_space, 0, 1) + week_component, 0, 1) + ).optimize() + + # Single elements (year/month/day) + individual_year_component = ( + # with era: (기원전|기원후) + 1~4 digits (leading zeros allowed → stripped) + ( + era_component + + insert_space + + pynutil.insert("year: \"") + + (YEAR_ERA_1TO4 @ graph_cardinal) + + pynutil.delete("년") + + pynutil.insert("년") + + pynutil.insert("\"") + ) + | + # no era: 1~4 digits (NOTE(review): YEAR_NO_ERA_1TO4 also strips up to three leading zeros) + ( + pynutil.insert("year: \"") + + (YEAR_NO_ERA_1TO4 @ graph_cardinal) + + pynutil.delete("년") + + pynutil.insert("년") + + pynutil.insert("\"") + ) + ).optimize() + + 
individual_month_component = ( + pynutil.insert("month: \"") + + month_cardinal + + pynutil.delete("월") + + pynutil.insert("월") + + pynutil.insert("\"") + ) + + individual_day_component = ( + pynutil.insert("day: \"") + + cardinal_lz + + pynutil.delete("일") + + pynutil.insert("일") + + pynutil.insert("\"") + ) + + week_full_word_acceptor = pynini.project(week, "output") + week_component_full_word = pynutil.insert("weekday: \"") + week_full_word_acceptor + pynutil.insert("\"") + + day_and_weekday_component = ( + individual_day_component + pynini.closure(insert_space, 0, 1) + week_component_full_word + ) + + month_and_weekday_component = ( + individual_month_component + pynini.closure(insert_space, 0, 1) + week_component_full_word + ) + + graph_individual_component = ( + day_and_weekday_component + | month_and_weekday_component + | individual_year_component + | individual_month_component + | individual_day_component + | week_component + ) + pynini.closure(insert_space + week_component, 0, 1) + + graph_individual_component_combined = ( + (individual_year_component + insert_space + individual_month_component) + | (individual_month_component + insert_space + individual_day_component) + | ( + individual_year_component + + insert_space + + individual_month_component + + insert_space + + individual_day_component + ) + ) + pynini.closure(insert_space + week_component, 0, 1) + + nendai = pynini.accep("년대") + era_nendai = ( + pynini.closure(era_component + insert_space, 0, 1) + + pynutil.insert("year: \"") + + graph_cardinal + + nendai + + pynutil.insert("\"") + ).optimize() + + graph_all_date = ( + graph_basic_date + | graph_american_date + | graph_european_date + | graph_individual_component + | graph_individual_component_combined + | era_nendai + ).optimize() + + final_graph = self.add_tokens(graph_all_date) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/decimal.py 
b/nemo_text_processing/text_normalization/ko/taggers/decimal.py new file mode 100644 index 000000000..6d2d07f66 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/decimal.py @@ -0,0 +1,76 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SIGMA, NEMO_SPACE, GraphFst +from nemo_text_processing.text_normalization.ko.utils import get_abs_path + + +class DecimalFst(GraphFst): + """ + Finite state transducer for classifying decimal numbers in Korean, e.g. 
+ 1.23 -> decimal { integer_part: "일" fractional_part: "이삼" } + -0.5 -> decimal { negative: "마이너스" integer_part: "영" fractional_part: "오" } + + Args: + cardinal: CardinalFst + """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="decimal", kind="classify", deterministic=deterministic) + + # Use the base cardinal graph for the integer part + base_integer_graph = cardinal.graph + # Only special-case 10000 -> 만 for decimal integer part (if needed) + specials_input = pynini.cross("10000", "만") + + # Try the special mapping first, then fall back to normal cardinal + cardinal_before_decimal = (specials_input | base_integer_graph).optimize() + + cardinal_after_decimal = pynini.string_file(get_abs_path("data/number/digit.tsv")) + zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) + + DOUBLE_QUOTE = '"' + + graph_integer = ( + pynutil.insert(f'integer_part: {DOUBLE_QUOTE}') + cardinal_before_decimal + pynutil.insert(DOUBLE_QUOTE) + ) + graph_fractional = ( + pynutil.insert(f'fractional_part: {DOUBLE_QUOTE}') + + pynini.closure(cardinal_after_decimal | zero, 1) + + pynutil.insert(DOUBLE_QUOTE) + ) + + # Decimal without a sign (e.g., 2.5) + graph_decimal_no_sign = graph_integer + pynutil.delete('.') + pynutil.insert(NEMO_SPACE) + graph_fractional + + # Negative sign handling (e.g., -2.5 or 마이너스2.5) + graph_with_negative = ( + pynutil.insert(f'negative: {DOUBLE_QUOTE}') + + (pynini.cross("-", "마이너스") | pynini.accep("마이너스")) + + pynutil.insert(DOUBLE_QUOTE) + ) + + graph_decimal = graph_decimal_no_sign | ( + graph_with_negative + pynutil.insert(NEMO_SPACE) + graph_decimal_no_sign + ) + + # For internal use without tokens + self.just_decimal = graph_decimal_no_sign.optimize() + + # Final graph with tokens + final_graph = self.add_tokens(graph_decimal) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/electronic.py 
b/nemo_text_processing/text_normalization/ko/taggers/electronic.py new file mode 100644 index 000000000..3f257c958 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/electronic.py @@ -0,0 +1,176 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import ( + NEMO_ALPHA, + NEMO_DIGIT, + NEMO_NOT_SPACE, + NEMO_SIGMA, + NEMO_SPACE, + GraphFst, + delete_space, + insert_space, +) +from nemo_text_processing.text_normalization.ko.utils import get_abs_path + + +class ElectronicFst(GraphFst): + """ + Finite state transducer (FST) for classifying **electronic expressions** such as + email addresses, URLs, and domain names in Korean. + + Example conversions: + - abc@nvidia.co.kr → electronic { username: "abc" domain: "nvidia.co.kr" } + - www.nvidia.com → electronic { domain: "www.nvidia.com" } + - https://nvidia.com → electronic { protocol: "HTTPS colon slash slash" domain: "nvidia.com" } + - 1234-5678-9012-3456 → electronic { protocol: "신용카드 " domain: "1234 5678 9012 3456" } + + Args: + cardinal: FST for digit/number verbalization (used for numeric parts if non-deterministic). + deterministic: If True, provides a single transduction path; otherwise allows multiple. 
+ """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="electronic", kind="classify", deterministic=deterministic) + + # ---------- Basic character ranges and symbols ---------- + LOWER = pynini.union(*[pynini.accep(c) for c in "abcdefghijklmnopqrstuvwxyz"]) + UPPER = pynini.union(*[pynini.accep(c) for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"]) + ASCII_LETTER = (LOWER | UPPER).optimize() + ASCII_ALNUM = (ASCII_LETTER | NEMO_DIGIT).optimize() + + HYPHEN = pynini.accep("-") + DOT = pynini.accep(".") + SLASH = pynini.accep("/") + AT = pynini.accep("@") + + # Handle numeric reading mode (only for non-deterministic mode) + numbers = ( + NEMO_DIGIT + if deterministic + else (pynutil.insert(NEMO_SPACE) + cardinal.long_numbers + pynutil.insert(NEMO_SPACE)) + ) + + # ---------- Load resources ---------- + cc_cues = pynini.string_file(get_abs_path("data/electronic/cc_cues.tsv")) + accepted_symbols = pynini.project(pynini.string_file(get_abs_path("data/electronic/symbol.tsv")), "input") + accepted_common_domains = pynini.project( + pynini.string_file(get_abs_path("data/electronic/domain.tsv")), "input" + ) + graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbol.tsv")).optimize() + + # ---------- Username ---------- + # Exclude '@' from username + username_symbols = pynini.difference(accepted_symbols, AT) + # Start with alphanumeric and allow symbols/numbers repeatedly + username_core = ASCII_ALNUM + pynini.closure(ASCII_ALNUM | numbers | username_symbols) + username = pynutil.insert('username: "') + username_core + pynutil.insert('"') + pynini.cross("@", NEMO_SPACE) + + # ---------- Domain ---------- + # Simplified RFC: label = [A-Za-z0-9-]+ , TLD = '.' 
[A-Za-z0-9]{2,} + label = pynini.closure(ASCII_ALNUM | HYPHEN, 1) + tld = DOT + pynini.closure(ASCII_ALNUM, 2) + # Domain can be (label + TLD) or TLD only (e.g., ".com") + domain_core = (label + pynini.closure(tld, 1)) | tld + + # Optional path after domain (e.g., /path) + path_segment = pynini.closure(NEMO_NOT_SPACE, 1) # at least one non-space character + path = SLASH + path_segment # / + optional_path = pynini.closure(path, 0, 1) # optional path + + domain_with_opt_path = domain_core + optional_path + + domain_graph_with_class_tags = ( + pynutil.insert('domain: "') + domain_with_opt_path.optimize() + pynutil.insert('"') + ) + + # ---------- protocol ---------- + protocol_symbols = pynini.closure((graph_symbols | pynini.cross(":", "colon")) + pynutil.insert(NEMO_SPACE)) + protocol_start = (pynini.cross("https", "HTTPS ") | pynini.cross("http", "HTTP ")) + ( + pynini.accep("://") @ protocol_symbols + ) + protocol_file_start = pynini.accep("file") + insert_space + (pynini.accep(":///") @ protocol_symbols) + protocol_end = pynutil.add_weight(pynini.cross("www", "WWW ") + pynini.accep(".") @ protocol_symbols, -1000) + protocol = protocol_file_start | protocol_start | protocol_end | (protocol_start + protocol_end) + protocol = pynutil.insert('protocol: "') + protocol + pynutil.insert('"') + + # ---------- Combine all graphs ---------- + graph = pynini.Fst() # empty + + # (1) Email pattern + email_guard = NEMO_SIGMA + AT + NEMO_SIGMA + DOT + NEMO_SIGMA + graph |= pynini.compose(email_guard, username + domain_graph_with_class_tags) + + # (2) Domain only (without protocol) + # Exclude '$' (conflict with money FST) and '@' (email) + dollar_accep = pynini.accep("$") + excluded_symbols = DOT | dollar_accep | AT + filtered_symbols = pynini.difference(accepted_symbols, excluded_symbols) + # Domain core graph + graph_domain = (pynutil.insert('domain: "') + domain_core + pynutil.insert('"')).optimize() + graph |= graph_domain + + known_extensions = pynini.project( + 
pynini.string_file(get_abs_path("data/electronic/extensions.tsv")), + "input", + ) + + filename_stem = pynini.closure( + pynini.difference(NEMO_NOT_SPACE, pynini.union(SLASH, DOT)), + 1, + ) + + file_with_extension = filename_stem + known_extensions + + graph |= (pynutil.insert('domain: "') + file_with_extension + pynutil.insert('"')).optimize() + + # (3) URL with protocol + graph |= protocol + insert_space + domain_graph_with_class_tags + + # (4) Credit card pattern: cue + 4–16 digits + if deterministic: + cc_digits = pynini.closure(NEMO_DIGIT, 4, 16) + cc_phrases = ( + pynutil.insert('protocol: "') + + cc_cues + + pynutil.insert('" domain: "') + + delete_space + + cc_digits + + pynutil.insert('"') + ) + graph |= cc_phrases + + four = pynini.closure(NEMO_DIGIT, 4, 4) + sep_token = pynini.union(HYPHEN, NEMO_SPACE) + cc16_grouped = (four + pynini.cross(sep_token, " ")) ** 3 + four + cc16_grouped = cc16_grouped + delete_space + + cc16_no_cue = ( + pynutil.insert('protocol: "신용카드 " ') + + pynutil.insert('domain: "') + + cc16_grouped + + pynutil.insert('"') + ) + + # Give it higher priority over Date FST + cc16_no_cue = pynutil.add_weight(cc16_no_cue.optimize(), -1.0) + + graph |= cc16_no_cue + + final_graph = self.add_tokens(graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/fraction.py b/nemo_text_processing/text_normalization/ko/taggers/fraction.py new file mode 100644 index 000000000..4e30ef1c6 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/fraction.py @@ -0,0 +1,97 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, delete_space +from nemo_text_processing.text_normalization.ko.utils import get_abs_path + + +class FractionFst(GraphFst): + """ + Finite state transducer for classifying Korean fractions, e.g. + 3/5 → tokens { fraction { numerator: "삼" denominator: "오" } } + 2과7/9 → tokens { fraction { integer_part: "이" numerator: "칠" denominator: "구" } } + 마이너스3/5 → tokens { fraction { negative: "마이너스" numerator: "삼" denominator: "오" } } + """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="fraction", kind="classify", deterministic=deterministic) + + cardinal = cardinal.graph + graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv")) + graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) + + DOUBLE_QUOTE = '"' + slash = pynutil.delete('/') + morphemes = pynini.accep('분의') + root = pynini.accep('√') + + # Decimal number (e.g., 1.23 → 일점이삼) + decimal_number = cardinal + pynini.cross(".", "점") + pynini.closure(graph_digit | graph_zero) + + # Accept cardinal / root + cardinal / decimal / root + decimal + numeral = cardinal | (root + cardinal) | decimal_number | (root + decimal_number) + + # Integer part (e.g., 2과, 1와) + integer_component = ( + pynutil.insert(f'integer_part: {DOUBLE_QUOTE}') + + numeral + + (pynini.accep("과") | pynini.accep("와")) + + pynutil.insert(DOUBLE_QUOTE) + ) + + integer_component_with_space = integer_component + delete_space + 
pynutil.insert(NEMO_SPACE) + + # Denominator and numerator + denominator_component = pynutil.insert(f'denominator: {DOUBLE_QUOTE}') + numeral + pynutil.insert(DOUBLE_QUOTE) + + numerator_component = pynutil.insert(f'numerator: {DOUBLE_QUOTE}') + numeral + pynutil.insert(DOUBLE_QUOTE) + + # Format 1: 3/4 style + graph_fraction_slash = ( + pynini.closure(integer_component_with_space, 0, 1) + + numerator_component + + slash + + pynutil.insert(NEMO_SPACE) + + denominator_component + + pynutil.insert(NEMO_SPACE) + + pynutil.insert('morphosyntactic_features: "분의"') + ) + + # Format 2: Korean native "4분의3" style + graph_fraction_word = ( + pynini.closure(integer_component_with_space, 0, 1) + + denominator_component + + pynutil.delete("분의") + + pynutil.insert(NEMO_SPACE) + + pynutil.insert('morphosyntactic_features: "분의"') + + pynutil.insert(NEMO_SPACE) + + numerator_component + ) + + # Optional minus sign + optional_sign = ( + pynutil.insert(f'negative: {DOUBLE_QUOTE}') + + (pynini.accep("마이너스") | pynini.cross("-", "마이너스")) + + pynutil.insert(DOUBLE_QUOTE) + + pynutil.insert(NEMO_SPACE) + ) + + # Combine full graph + graph = pynini.closure(optional_sign, 0, 1) + (graph_fraction_slash | graph_fraction_word) + self.graph = graph.optimize() + final_graph = self.add_tokens(graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/measure.py b/nemo_text_processing/text_normalization/ko/taggers/measure.py new file mode 100644 index 000000000..0891e5783 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/measure.py @@ -0,0 +1,106 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, delete_space, insert_space +from nemo_text_processing.text_normalization.ko.utils import get_abs_path + + +class MeasureFst(GraphFst): + """ + Finite state transducer for classifying Korean measure expressions. + - 1kg → measure { cardinal { integer: "일" } units: "킬로그램" } + - 12.5km → measure { decimal { integer_part: "십이" fractional_part: "오" } units: "킬로미터" } + - 2/3m → measure { fraction { numerator: "이" denominator: "삼" } units: "미터" } + - 60km/h → measure { cardinal { integer: "육십" } units: "킬로미터 퍼 시간" } + + This FST attaches measurement units (e.g., "킬로미터", "그램") to numeric expressions + classified by the `cardinal`, `decimal`, or `fraction` subgraphs. + + Args: + cardinal: FST handling integer (cardinal) numbers. + decimal: FST handling decimal numbers (optional). + fraction: FST handling fractional numbers (optional). + deterministic: If True, provides a single transduction path; otherwise allows multiple. 
+ """ + + def __init__( + self, + cardinal: GraphFst, + decimal: GraphFst = None, + fraction: GraphFst = None, + deterministic: bool = True, + ): + super().__init__(name="measure", kind="classify", deterministic=deterministic) + + # Numeric subgraphs + graph_cardinal = cardinal.graph + + # Unit lexicon + graph_unit = pynini.string_file(get_abs_path("data/measure/unit.tsv")) + + # Per-expression handling (e.g., km/h, m/s) + opt_space = pynini.closure(delete_space, 0, 1) + per = pynini.cross("/", "퍼") + opt_space + insert_space + graph_unit + optional_per = pynini.closure(opt_space + insert_space + per, 0, 1) + + # Final unit FST produces units: "UNIT", units: "UNIT 퍼 UNIT" (e.g. km/h), or a bare units: "퍼 UNIT" + unit = pynutil.insert('units: "') + (graph_unit + optional_per | per) + pynutil.insert('"') + + minus_as_field = pynutil.insert('negative: "마이너스" ') + consume_minus = pynini.cross("-", "") | pynini.cross("마이너스", "") + + # Optional minus field + removal of actual sign symbol or word + optional_minus = pynini.closure(minus_as_field + consume_minus + opt_space, 0, 1) + + # Combine numeric and unit components + pieces = [] + + # 1) Cardinal form: e.g., "12kg" + sub_cardinal = ( + pynutil.insert("cardinal { ") + + pynutil.insert('integer: "') + + graph_cardinal + + delete_space + + pynutil.insert('" } ') + + unit + ) + pieces.append(sub_cardinal) + + # 2) Decimal form: e.g., "12.5km" + if decimal is not None: + sub_decimal = ( + pynutil.insert("decimal { ") + + optional_minus + + decimal.just_decimal + + delete_space + + pynutil.insert(" } ") + + unit + ) + pieces.append(sub_decimal) + + # 3) Fraction form: e.g., "2/3m" or "삼분의 이 미터" + if fraction is not None: + sub_fraction = pynutil.insert("fraction { ") + fraction.graph + delete_space + pynutil.insert(" } ") + unit + pieces.append(sub_fraction) + + # Union all supported numeric forms (cardinal | decimal | fraction) + graph = pieces[0] + for p in pieces[1:]: + graph |= p + + # Final wrapping into tokens { measure { ... 
} } + self.fst = self.add_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/money.py b/nemo_text_processing/text_normalization/ko/taggers/money.py new file mode 100644 index 000000000..d4320aa7a --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/money.py @@ -0,0 +1,99 @@ +# Copyright (c) 2025 NVIDIA CORPORATION.  All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +#     http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst, delete_space, insert_space +from nemo_text_processing.text_normalization.ko.utils import get_abs_path, load_labels + + +class MoneyFst(GraphFst): + """ + Finite state transducer for classifying Korean money. + + Example inputs and outputs: + ₩350 -> money { currency_maj: "원" integer_part: "삼백오십" } + 350원 -> money { integer_part: "삼백오십" currency_maj: "원" } + KRW 12,050 -> money { currency_maj: "원" integer_part: "일만이천오십" } + 12만 500원 -> money { integer_part: "십이만오백" currency_maj: "원" } + ₩10.25 -> money { currency_maj: "원" integer_part: "십" minor_part: "이십오" } # optional 2-digit minor + 0원 -> money { integer_part: "영" currency_maj: "원" } + + Args: + cardinal: CardinalFst + deterministic: If True, provide a single transduction; + if False, allow multiple transductions. 
+ """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="money", kind="classify", deterministic=deterministic) + + graph_cardinal = cardinal.graph + sp = pynini.closure(delete_space) # absorb any amount of spaces in input + + # --- Numbers (integer / optional minor) --- + # Integer part: "0" or a non-zero leading digit; allow commas (e.g., 18,925,000) + integer_part_fst = pynini.union("0", (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT | pynutil.delete(","))) + + # Plain integer → integer_part: "" + graph_integer_plain = ( + pynutil.insert('integer_part: "') + (integer_part_fst @ graph_cardinal) + pynutil.insert('" ') + ) + + # Optional 2-digit decimal (kept as minor_part if ever used downstream) + decimal_part_fst = NEMO_DIGIT**2 + graph_minor = pynutil.insert('minor_part: "') + (decimal_part_fst @ graph_cardinal) + pynutil.insert('" ') + + # Integer with scale suffix (만/억/조) → wrap the whole thing in one integer_part + scale_unit = pynini.union("만", "억", "조") + value_with_scale = (integer_part_fst @ graph_cardinal) + scale_unit + graph_integer_with_suffix = ( + pynutil.insert('integer_part: "') + value_with_scale + pynutil.insert('" ') + ).optimize() + + # Integer (+ optional ".<2-digit>" minor) + number_component_plain = graph_integer_plain + pynini.closure(pynutil.delete(".") + graph_minor, 0, 1) + number_component = (graph_integer_with_suffix | number_component_plain).optimize() + + # --- Currency (prefix or suffix) --- + # currency_major.tsv example: + # ₩ 원 + # KRW 원 + # 원 원 + maj_labels = load_labels(get_abs_path("data/money/currency_major.tsv")) + + # Prefix currency (e.g., ₩, KRW): emit currency_maj then number + currency_major_prepended = pynini.union( + *[pynutil.delete(surface) + pynutil.insert(f'currency_maj: "{unit}" ') for surface, unit in maj_labels] + ).optimize() + + # Suffix currency (e.g., ...원, ...달러): convert unit literal to currency_maj + currency_major_appended = pynini.union( + 
*[pynutil.delete(unit) + pynutil.insert(f'currency_maj: "{unit}" ') for _, unit in maj_labels] + ).optimize() + + # --- Compose (NO period handling) --- + # NOTE: We deliberately do NOT consume '/월', '/년', '/주', '/일', '/시간' here. + # If present in the raw text, they remain outside the money token and can be handled upstream/elsewhere. + + # [currency] [number] + graph_prepend = (currency_major_prepended + sp + number_component).optimize() + + # [number] [currency] + graph_append = (number_component + currency_major_appended).optimize() + + graph = (graph_prepend | graph_append).optimize() + + self.fst = self.add_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/ordinal.py b/nemo_text_processing/text_normalization/ko/taggers/ordinal.py new file mode 100644 index 000000000..59fa30ada --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/ordinal.py @@ -0,0 +1,85 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst +from nemo_text_processing.text_normalization.ko.utils import get_abs_path + + +class OrdinalFst(GraphFst): + """ + Finite state transducer for classifying Korean ordinal expressions, e.g. 
+ 1번째 -> ordinal { integer: "첫번째" } + + Args: + cardinal: CardinalFst + """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="ordinal", kind="classify", deterministic=deterministic) + + # Load base .tsv files + graph_digit = pynini.string_file(get_abs_path("data/ordinal/digit.tsv")) + graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) + graph_exceptions = pynini.string_file(get_abs_path("data/ordinal/exceptions.tsv")) + graph_tens = pynini.string_file(get_abs_path("data/ordinal/tens.tsv")) + graph_tens_prefix = pynini.string_file(get_abs_path("data/ordinal/tens_prefix.tsv")) + + graph_11_to_39 = (graph_tens_prefix + graph_digit).optimize() + + # Combine all ordinal forms from 1 to 39 + graph_ordinal_1to39 = ( + graph_exceptions | graph_digit | graph_zero | graph_tens | graph_11_to_39 + ).optimize() + pynini.accep("번째") + + # Accept tens digit 4–9 + tens_digit_4_to_9_accep = pynini.union(*[pynini.accep(str(i)) for i in range(4, 10)]) + # Accept any single digit + any_single_digit_accep = pynini.union(*[pynini.accep(str(i)) for i in range(0, 10)]) + # Combine two digits + from_40_to_99_inputs = tens_digit_4_to_9_accep + any_single_digit_accep + + # Match numbers with 3 or more digits + input_100_plus = pynini.closure(any_single_digit_accep, 3) + + # Combine both ranges (40–99 and 100+): total range = 40 and above + filter_inputs_from_40 = (from_40_to_99_inputs | input_100_plus).optimize() + + # Only allow cardinal numbers that are 40 or more + graph_cardinal_from40_filtered = pynini.compose(filter_inputs_from_40, cardinal.graph) + + # Add "번째" to the filtered cardinal graph. + graph_ordinal_from40 = graph_cardinal_from40_filtered + pynini.accep("번째") + + graph_ordinal = (graph_ordinal_1to39 | graph_ordinal_from40).optimize() # Handles 1-39 # Handles 40+ + + # Single-character particles (가, 이, 은, 는, 로, 도 ...) 
+ josa_single = pynini.union("가", "이", "은", "는", "를", "을", "로", "도", "다") + + # Multi-character particles (부터, 까지) + josa_multi = pynini.union("부터", "까지") + + # Allow patterns like: + # 번째 + (optional single-josa) + (optional multi-josa) + josa = (josa_single.ques + josa_multi.ques).optimize() + + # Final ordinal graph with optional particles + graph_ordinal_with_josa = (graph_ordinal + josa).optimize() + + # Build the “integer: …” token structure + final_graph = pynutil.insert('integer: "') + graph_ordinal_with_josa + pynutil.insert('"') + + self.fst = self.add_tokens(final_graph).optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/punctuation.py b/nemo_text_processing/text_normalization/ko/taggers/punctuation.py new file mode 100644 index 000000000..a10250a99 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/punctuation.py @@ -0,0 +1,37 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil +from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst + + +class PunctuationFst(GraphFst): + """ + Finite state transducer for classifying punctuation + e.g. 
a, -> tokens { name: "a" } tokens { name: "," } + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="punctuation", kind="classify", deterministic=deterministic) + + range_component = pynini.cross("〜", "부터") | pynini.accep("부터") + + graph = pynutil.insert('name: "') + range_component + pynutil.insert('"') + self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/telephone.py b/nemo_text_processing/text_normalization/ko/taggers/telephone.py new file mode 100644 index 000000000..f4d88b445 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/telephone.py @@ -0,0 +1,89 @@ +# Copyright (c) 2025 NVIDIA CORPORATION.  All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +#     http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SIGMA, GraphFst, delete_space, insert_space +from nemo_text_processing.text_normalization.ko.utils import get_abs_path + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for classifying Korean telephone numbers. 
+ + Example inputs → tokens: + +82 010-3713-7050 -> telephone { country_code: "국가번호 팔이," number_part: "영일영 삼칠일삼 칠영오영" } + +1 (415) 555-0123 -> telephone { country_code: "국가번호 일," number_part: "사일오 오오오 영일이삼" } + (031)371-3700 -> telephone { number_part: "영삼일 삼칠일 삼칠영영" } + 010-3713-7050 -> telephone { number_part: "영일영 삼칠일삼 칠영오영" } + 010.777.8888 -> telephone { number_part: "영일영 칠칠칠 팔팔팔팔" } + + Args: + deterministic (bool, optional): If True, provide a single transduction; + if False, allow multiple transductions. + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="telephone", kind="classify", deterministic=deterministic) + # Separator between digit blocks (e.g., "-" or ".") + delete_sep = pynutil.delete("-") | pynutil.delete(".") + # Optional space inserted between blocks + insert_block_space = insert_space + + # 1) safe digit mapping: force 0 -> "영" (do not rely on zero.tsv invert) + digit = pynini.string_file(get_abs_path("data/number/digit.tsv")).optimize() + zero_map = pynini.cross("0", "영") + digit_ko = (digit | zero_map).optimize() + + three_digits = digit_ko**3 + four_digits = digit_ko**4 + + # country code: "+1", "+82", "+1-" + cc_digits = pynini.closure(digit_ko, 1, 3) + + country_code = ( + pynutil.delete("+") + + pynutil.insert('country_code: "') + + cc_digits + + pynutil.insert('"') + + pynini.closure(pynutil.delete("-") | pynutil.delete(" "), 0, 1) + + delete_space + ) + + # area part: "123-" | "123." | "(123)" [space?] or "(123)-" + area_core = three_digits + area_part = ( + (area_core + delete_sep) + | ( + pynutil.delete("(") + + area_core + + pynutil.delete(")") + + pynini.closure(pynutil.delete(" "), 0, 1) + + pynini.closure(delete_sep, 0, 1) + ) + ) + insert_block_space + + # 2) allow 3 **or 4** digits in the middle block (to support 010-3713-7050) + mid = pynini.union(three_digits, four_digits) + last4 = four_digits + + # consume '-' or '.' 
between middle and last blocks + number_part_core = area_part + mid + delete_sep + insert_block_space + last4 + number_part = pynutil.insert('number_part: "') + number_part_core + pynutil.insert('"') + + # final graph: with or without country code + graph = pynini.union(country_code + insert_space + number_part, number_part).optimize() + + self.fst = self.add_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/time.py b/nemo_text_processing/text_normalization/ko/taggers/time.py new file mode 100644 index 000000000..b8a499823 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/taggers/time.py @@ -0,0 +1,192 @@ +# Copyright (c) 2025 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, delete_space, insert_space +from nemo_text_processing.text_normalization.ko.utils import get_abs_path + + +class TimeFst(GraphFst): + """ + Finite state transducer for classifying time, e.g. 
+ 오전 10시 30분 -> time { suffix: "오전" hours: "열시" minutes: "삼십분" } + 오후 3시 반 -> time { suffix: "오후" hours: "세시" minutes: "삼십분" } + 자정 -> time { hours: "영시" } + 정오 -> time { hours: "열두시" } + + Args: + cardinal: CardinalFst (Korean cardinal graph) + """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="time", kind="classify", deterministic=deterministic) + + # Base number-to-words graph from the Cardinal Fst + graph_cardinal = cardinal.graph + strip0 = pynini.closure(pynutil.delete("0"), 0, 1) + + SP = pynini.closure(delete_space) + SEP = SP + insert_space + hour_clock = pynini.string_file(get_abs_path("data/time/hour.tsv")) + division = pynini.string_file(get_abs_path("data/time/division.tsv")) + + division_component = pynutil.insert("suffix: \"") + division + pynutil.insert("\"") + + # --- Special single-word times --- + noon = pynini.accep("정오") + midnight = pynini.accep("자정") + noon_component = pynutil.insert("hours: \"") + pynini.cross(noon, "열두시") + pynutil.insert("\"") + midnight_component = pynutil.insert("hours: \"") + pynini.cross(midnight, "영시") + pynutil.insert("\"") + + # --- Korean Hangul components (H시 [M분|반] [S초]) --- + # "0" or "00" -> 0 + h_zero = strip0 + pynini.accep("0") + # "13".."24" + h_13_24 = pynini.union(*[str(i) for i in range(13, 25)]) + + # "0시" -> "영시" + hour_component_ko_zero = ( + pynutil.insert("hours: \"") + + pynini.cross(h_zero, "영") + + pynutil.delete("시") + + pynutil.insert("시") + + pynutil.insert("\"") + ) + + # "13시..24시" -> Sino-Korean words (십삼/…/이십사) + 시 + hour_component_ko_13_24 = ( + pynutil.insert("hours: \"") + + (h_13_24 @ graph_cardinal) + + pynutil.delete("시") + + pynutil.insert("시") + + pynutil.insert("\"") + ) + + # "1시..12시" -> Native Korean words (한/두/세/네/…/열두) + 시 + hour_component_ko_1_12 = ( + pynutil.insert("hours: \"") + + (strip0 + hour_clock) + + pynutil.delete("시") + + pynutil.insert("시") + + pynutil.insert("\"") + ) + + # Priority: 13-24 > 0 > 1-12 to prevent 
partial matching errors + hour_component_ko = (hour_component_ko_13_24 | hour_component_ko_zero | hour_component_ko_1_12).optimize() + + # Minutes: number+"분" or "반" (approx. 30분). Allows optional '쯤|경' after minutes/반. + about_word = pynini.union("쯤", "경") + minute_number = ( + pynutil.insert("minutes: \"") + + (strip0 + graph_cardinal) + + pynutil.delete("분") + + pynutil.insert("분") + + pynutil.insert("\"") + ) + minute_half = ( + pynutil.insert("minutes: \"") + + pynutil.delete("반") + + pynutil.insert("반") + + pynini.closure(about_word, 0, 1) + + pynutil.insert("\"") + ) + minute_component_ko = (minute_half | minute_number).optimize() + + second_component_ko = ( + pynutil.insert("seconds: \"") + + (strip0 + graph_cardinal) + + pynutil.delete("초") + + pynutil.insert("초") + + pynutil.insert("\"") + ) + + # Allow suffix before or after + suffix_prefix_opt = pynini.closure(division_component + SEP, 0, 1) + suffix_postfix_opt = pynini.closure(SEP + division_component, 0, 1) + + # Hangul patterns + graph_hangul = ( + suffix_prefix_opt + + ( + hour_component_ko + | (hour_component_ko + SEP + minute_component_ko) + | (hour_component_ko + SEP + minute_component_ko + SEP + second_component_ko) + | minute_component_ko + | (minute_component_ko + SEP + second_component_ko) + | second_component_ko + ) + + suffix_postfix_opt + ).optimize() + + # Special words with optional suffix + graph_special = (suffix_prefix_opt + (noon_component | midnight_component) + suffix_postfix_opt).optimize() + + # --- Clock patterns: HH:MM[:SS] --- + colon = pynutil.delete(":") + + # Map 1-12 hours using native-Korean words, allowing an optional leading zero. 
+ graph_hour_1_12 = ( + pynutil.insert("hours: \"") + (strip0 + hour_clock) + pynutil.insert("시") + pynutil.insert("\"") + ).optimize() + + # 0, 00, and 13-24 -> Sino-Korean words + hour_sino_val = ( + pynini.cross("00", "0") + | pynini.cross("0", "0") + | pynini.union(*[pynini.cross(str(i), str(i)) for i in range(13, 25)]) + ) + hour_sino_read = hour_sino_val @ graph_cardinal + + graph_hour_others = pynutil.insert("hours: \"") + hour_sino_read + pynutil.insert("시") + pynutil.insert("\"") + + hour_clock_component = (graph_hour_1_12 | graph_hour_others).optimize() + + minute_clock_component = ( + pynutil.insert("minutes: \"") + strip0 + graph_cardinal + pynutil.insert("분") + pynutil.insert("\"") + ) + second_clock_component = ( + pynutil.insert("seconds: \"") + strip0 + graph_cardinal + pynutil.insert("초") + pynutil.insert("\"") + ) + + # HH:MM (drop minutes if "00") + graph_hm_clock = ( + suffix_prefix_opt + + hour_clock_component + + delete_space.ques + + colon + + delete_space.ques + + (pynini.cross("00", "") | pynini.closure(insert_space + minute_clock_component, 0, 1)) + + suffix_postfix_opt + ).optimize() + + # HH:MM:SS (drop minutes/seconds if "00") + graph_hms_clock = ( + suffix_prefix_opt + + hour_clock_component + + delete_space.ques + + colon + + delete_space.ques + + (pynini.cross("00", "") | pynini.closure(insert_space + minute_clock_component, 0, 1)) + + delete_space.ques + + colon + + delete_space.ques + + (pynini.cross("00", "") | pynini.closure(insert_space + second_clock_component, 0, 1)) + + suffix_postfix_opt + ).optimize() + + graph = (graph_special | graph_hangul | graph_hm_clock | graph_hms_clock).optimize() + graph_final = self.add_tokens(graph) + self.fst = graph_final.optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py new file mode 100644 index 000000000..fba6302a2 --- /dev/null +++ 
b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py @@ -0,0 +1,115 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import ( + NEMO_WHITE_SPACE, + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) +from nemo_text_processing.text_normalization.ko.taggers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.ko.taggers.date import DateFst +from nemo_text_processing.text_normalization.ko.taggers.decimal import DecimalFst +from nemo_text_processing.text_normalization.ko.taggers.electronic import ElectronicFst +from nemo_text_processing.text_normalization.ko.taggers.fraction import FractionFst +from nemo_text_processing.text_normalization.ko.taggers.measure import MeasureFst +from nemo_text_processing.text_normalization.ko.taggers.money import MoneyFst +from nemo_text_processing.text_normalization.ko.taggers.ordinal import OrdinalFst +from nemo_text_processing.text_normalization.ko.taggers.punctuation import PunctuationFst +from nemo_text_processing.text_normalization.ko.taggers.telephone import TelephoneFst +from nemo_text_processing.text_normalization.ko.taggers.time import TimeFst +from nemo_text_processing.text_normalization.ko.taggers.whitelist import WhiteListFst +from 
class ClassifyFst(GraphFst):
    """
    Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
    More details to deployment at NeMo/tools/text_processing_deployment.

    Args:
        input_case: accepting either "lower_cased" or "cased" input.
            Currently not used by this grammar.
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
        whitelist: path to a file with whitelist replacements.
            Currently not used: the Korean whitelist tagger always reads its
            bundled ``data/whitelist.tsv``; a warning is logged when a path is given.
    """

    def __init__(
        self,
        input_case: str = "cased",
        deterministic: bool = True,
        cache_dir: str = None,
        overwrite_cache: bool = False,
        whitelist: str = None,
    ):
        super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)

        if whitelist is not None:
            # Make the ignored argument visible instead of silently dropping it.
            logger.warning(f"Custom whitelist file '{whitelist}' is ignored by the Korean ClassifyFst.")

        far_file = None
        # The literal string "None" is treated like a missing cache dir.
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, f"ko_tn_{deterministic}_tokenize.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logger.info(f"ClassifyFst.fst was restored from {far_file}.")
        else:
            cardinal = CardinalFst(deterministic=deterministic)
            date = DateFst(cardinal=cardinal, deterministic=deterministic)
            time = TimeFst(cardinal=cardinal, deterministic=deterministic)
            ordinal = OrdinalFst(cardinal=cardinal, deterministic=deterministic)
            word = WordFst(deterministic=deterministic)
            decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic)
            fraction = FractionFst(cardinal=cardinal, deterministic=deterministic)
            # Kept under its own name so the `whitelist` path argument is not shadowed.
            whitelist_graph = WhiteListFst(deterministic=deterministic)
            punctuation = PunctuationFst(deterministic=deterministic)
            money = MoneyFst(cardinal=cardinal, deterministic=deterministic)
            telephone = TelephoneFst(deterministic=deterministic)
            measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic)
            electronic = ElectronicFst(cardinal=cardinal, deterministic=deterministic)

            # Lower weight = preferred path; the generic word tagger is the
            # expensive fallback.
            classify = pynini.union(
                pynutil.add_weight(cardinal.fst, 1.1),
                pynutil.add_weight(date.fst, 1.1),
                pynutil.add_weight(time.fst, 1.1),
                pynutil.add_weight(fraction.fst, 1.0),
                pynutil.add_weight(ordinal.fst, 1.1),
                pynutil.add_weight(decimal.fst, 1.05),
                pynutil.add_weight(word.fst, 100),
                pynutil.add_weight(money.fst, 1.1),
                pynutil.add_weight(measure.fst, 1.1),
                pynutil.add_weight(punctuation.fst, 1.0),
                pynutil.add_weight(whitelist_graph.fst, 1.1),
                pynutil.add_weight(telephone.fst, 1.1),
                pynutil.add_weight(electronic.fst, 1.11),
            )

            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")

            # One or more tokens; consecutive tokens may be separated by
            # whitespace or by nothing at all.
            graph = (
                delete_space + token + pynini.closure((delete_extra_space | pynini.accep("")) + token) + delete_space
            )

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
class WhiteListFst(GraphFst):
    """
    Finite state transducer for classifying whitelisted strings.

    Every entry of ``data/whitelist.tsv`` is rewritten to its replacement and
    emitted as a bare ``name: "<replacement>"`` field.

    Args:
        deterministic: passed through to ``GraphFst``.
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        replacements = pynini.string_file(get_abs_path("data/whitelist.tsv"))

        opening = pynutil.insert('name: "')
        closing = pynutil.insert('"')
        tagged = opening + replacements + closing

        self.fst = tagged.optimize()
class WordFst(GraphFst):
    """
    Finite state transducer for classifying a Korean word.

    A word is one or more characters that are neither whitespace nor digits;
    it is passed through unchanged inside a ``name`` field, e.g.
        이름 -> name: "이름"

    Args:
        deterministic: passed through to ``GraphFst``.
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="word", kind="classify", deterministic=deterministic)

        # NOTE(review): this module imports NEMO_DIGIT / NEMO_NOT_SPACE / GraphFst
        # from the ja graph_utils, not the ko one — confirm that is intended.
        non_digit_char = pynini.difference(NEMO_NOT_SPACE, NEMO_DIGIT)

        tagged = pynutil.insert('name: "') + pynini.closure(non_digit_char, 1) + pynutil.insert('"')

        self.fst = tagged.optimize()
def get_abs_path(rel_path):
    """
    Resolve a path relative to the directory that contains this file.

    Args:
        rel_path: path relative to this file's directory

    Returns the absolute path as a string
    """
    return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path


def load_labels(abs_path):
    """
    Load a tab-separated file as a list of rows.

    Args:
        abs_path: absolute path of the TSV file

    Returns a list with one entry per line, each entry being the list of
    tab-separated fields of that line (a blank line yields an empty list)
    """
    # newline="" lets the csv module do its own line-ending handling,
    # as recommended by the csv documentation.
    with open(abs_path, encoding="utf-8", newline="") as label_tsv:
        labels = list(csv.reader(label_tsv, delimiter="\t"))
    return labels


def augment_labels_with_punct_at_end(labels):
    """
    augments labels: if key ends on a punctuation that value does not have, add a new label
    where the value maintains the punctuation

    Only the full stop "." is considered. Rows with fewer than two fields are
    skipped; empty keys or values are handled without raising.

    Args:
        labels : input labels (rows of [key, value, ...])
    Returns:
        additional labels (the input list is not modified)
    """
    res = []
    for label in labels:
        if len(label) < 2:
            continue
        key, value = label[0], label[1]
        # endswith() is safe on empty strings, unlike indexing with [-1].
        if key.endswith(".") and not value.endswith("."):
            res.append([key, value + "."] + label[2:])
    return res
class CardinalFst(GraphFst):
    """
    Finite state transducer for verbalizing cardinal, e.g.
        cardinal { negative: "true" integer: "이십삼" } -> 마이너스 이십삼

    The quoted ``integer`` value is copied to the output unchanged; only the
    optional ``negative: "true"`` field is rewritten into a spoken sign.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic)

        negative_field = 'negative: "true"'

        # Spoken forms of the sign; the extra variants are only offered in
        # non-deterministic mode.
        sign_readings = ["마이너스 "]
        if not deterministic:
            sign_readings.extend(["음수 ", "- "])
        sign = pynini.union(*[pynini.cross(negative_field, reading) for reading in sign_readings])

        self.optional_sign = pynini.closure(sign + delete_space, 0, 1)

        # Strip the surrounding quotes and keep whatever is between them.
        quoted_value = pynutil.delete('"') + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete('"')
        self.integer = delete_space + quoted_value

        self.numbers = self.optional_sign + pynutil.delete("integer:") + self.integer

        self.fst = self.delete_tokens(self.numbers).optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space


class DateFst(GraphFst):
    """
    Finite state transducer for verbalizing Korean dates.

    Accepts the optional fields ``era``, ``year``, ``month``, ``day`` and ``weekday`` in that
    order, removes the field names and quotes and keeps the quoted values. A ``month``
    directly followed by a ``weekday`` is accepted as well.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="date", kind="verbalize", deterministic=deterministic)

        def quoted_value(field_name):
            # field_name: "<value>"  ->  <value>
            return pynutil.delete(f'{field_name}: "') + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete('"')

        def maybe(fst):
            # Zero or one occurrence of fst.
            return pynini.closure(fst, 0, 1)

        era = quoted_value("era")
        year = quoted_value("year")
        month = quoted_value("month")
        day = quoted_value("day")

        # The weekday value may additionally be padded with spaces inside the quotes.
        weekday = (
            pynutil.delete('weekday: "')
            + pynini.closure(delete_space)
            + pynini.closure(NEMO_NOT_QUOTE)
            + pynini.closure(delete_space)
            + pynutil.delete('"')
        )

        # Separator between two fields: drop the input spacing, then apply insert_space.
        gap = pynini.closure(delete_space, 0, 1) + insert_space

        full_date = maybe(era + gap) + maybe(year + gap) + maybe(month + gap) + maybe(day) + maybe(gap + weekday)
        month_and_weekday = month + gap + weekday

        self.fst = self.delete_tokens(full_date | month_and_weekday).optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst


class DecimalFst(GraphFst):
    """
    Finite state transducer for verbalizing decimals.

    Expects ``integer_part: "..."`` followed by one space and ``fractional_part: "..."``,
    optionally preceded by ``negative: "마이너스"`` and one space. The integer value is kept,
    "점" is inserted in front of the fractional value, and a present sign is emitted
    followed by a space.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="decimal", kind="verbalize", deterministic=deterministic)

        value = pynini.closure(NEMO_NOT_QUOTE, 1)
        closing_quote = pynutil.delete('"')
        single_space = pynutil.delete(" ")

        whole = pynutil.delete('integer_part: "') + value + closing_quote
        # The decimal point is spoken as "점" directly before the fractional digits.
        fraction = pynutil.delete('fractional_part: "') + pynutil.insert("점") + value + closing_quote
        unsigned = whole + single_space + fraction

        # negative: "마이너스" + space  ->  마이너스 + space
        sign = pynutil.delete('negative: "') + pynini.accep("마이너스") + closing_quote + single_space
        signed = sign + pynutil.insert(" ") + unsigned

        self.fst = self.delete_tokens(unsigned | signed).optimize()
import pynini
from pynini.examples import plurals
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import (
    NEMO_ALPHA,
    NEMO_CHAR,
    NEMO_DIGIT,
    NEMO_NOT_QUOTE,
    NEMO_SIGMA,
    NEMO_SPACE,
    GraphFst,
    delete_extra_space,
    delete_space,
    insert_space,
)
from nemo_text_processing.text_normalization.ko.utils import get_abs_path


class ElectronicFst(GraphFst):
    """
    Finite state transducer (FST) for verbalizing **electronic expressions** (email/URL/domain).

    Input tokens:
        tokens { electronic { username: "abc" domain: "abc.com" } }

    Example output (policy-dependent):
        abc 골뱅이 abc 닷컴

    Two shapes are accepted:
        * optional ``protocol`` + optional ``username`` + ``domain`` (default case);
        * a ``protocol`` starting with "신용카드" followed by a 16-digit ``domain``, which is
          read as four groups of four digits.

    Args:
        deterministic: If True, produce a single verbalization.
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)

        # 1) Digits (0-9); "공" is an additional reading of zero in non-deterministic mode.
        graph_digit_no_zero = pynini.string_file(get_abs_path("data/number/digit.tsv")).optimize()

        graph_zero = pynini.cross("0", "영")
        if not deterministic:
            graph_zero |= pynini.cross("0", "공")
        graph_digit = (graph_digit_no_zero | graph_zero).optimize()

        # Rewrites every digit wherever it occurs inside a longer string.
        digit_inline_rewrite = pynini.cdrewrite(
            graph_digit,
            "",
            "",
            NEMO_SIGMA,
        )

        # 2) username part: the value is kept, only its digits are verbalized.
        raw_username = pynini.closure(NEMO_NOT_QUOTE, 1)

        user_name = (
            pynutil.delete("username:")
            + delete_space
            + pynutil.delete('"')
            + (raw_username @ digit_inline_rewrite)
            + pynutil.delete('"')
        )

        # 3) domain part (handle common endings like .com -> 닷컴)
        domain_common_pairs = (
            pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
            | pynini.string_file(get_abs_path("data/electronic/extensions.tsv"))
        ).optimize()

        # Rewrite known domains (.com -> 닷컴); applied after the digit rewrite.
        tld_rewrite = pynini.cdrewrite(
            domain_common_pairs,
            "",
            "",
            NEMO_SIGMA,
        )
        # Add a space before "닷" whenever it is preceded by another character.
        add_space_before_dot = pynini.cdrewrite(
            pynini.cross("닷", " 닷"),
            (NEMO_ALPHA | NEMO_DIGIT | NEMO_CHAR),
            "",
            NEMO_SIGMA,
        )

        raw_domain = pynini.closure(NEMO_NOT_QUOTE, 1)

        # Exactly 16 digits, split 4-4-4-4 by inserted spaces, then verbalized digit by digit.
        four = pynini.closure(NEMO_DIGIT, 4, 4)
        cc16_grouped = four + pynutil.insert(" ") + four + pynutil.insert(" ") + four + pynutil.insert(" ") + four
        cc_domain = (cc16_grouped @ digit_inline_rewrite).optimize()

        domain = (
            pynutil.delete("domain:")
            + delete_space
            + pynutil.delete('"')
            + ((raw_domain @ digit_inline_rewrite) @ tld_rewrite @ add_space_before_dot)
            + delete_space
            + pynutil.delete('"')
        ).optimize()

        # 4) protocol (like "https://" or "file:///"); the value is emitted unchanged.
        protocol = (
            pynutil.delete('protocol: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + insert_space
        )

        # Same field, but restricted to values that start with "신용카드".
        protocol_raw = pynutil.delete('protocol: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"')
        cc_protocol_guard = pynini.accep("신용카드") + pynini.closure(NEMO_NOT_QUOTE, 0)
        cc_protocol = (protocol_raw @ cc_protocol_guard) + insert_space

        # Credit card case: "신용카드 ..." protocol + 16-digit domain grouped as 4-4-4-4
        cc_graph = (
            cc_protocol
            + delete_space
            + pynutil.delete("domain:")
            + delete_space
            + pynutil.delete('"')
            + cc_domain
            + pynutil.delete('"')
            + delete_space
        ).optimize()

        # 5) Combine: optional protocol + optional username + domain
        default_graph = (
            pynini.closure(protocol + delete_space, 0, 1)
            + pynini.closure(user_name + delete_space + pynutil.insert(" 골뱅이 ") + delete_space, 0, 1)
            + domain
            + delete_space
        ).optimize()

        # Apply delete_extra_space everywhere in the combined output.
        graph = (cc_graph | default_graph) @ pynini.cdrewrite(delete_extra_space, "", "", NEMO_SIGMA)
        self.fst = self.delete_tokens(graph).optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space


class FractionFst(GraphFst):
    """
    Finite state transducer for verbalizing Korean fractions.

    Expected field order inside ``fraction { ... }``:
        [negative: "..."] [integer_part: "..."] denominator: "..."
        [morphosyntactic_features: "분의"] numerator: "..."

    The output is built as
        [negative + NEMO_SPACE] [integer_part + NEMO_SPACE] denominator + "분의" + NEMO_SPACE + numerator
    and a leading "√" in the denominator or numerator is read as "루트" followed by NEMO_SPACE.
    Assuming NEMO_SPACE is a single space (defined in graph_utils, not shown here):
        tokens { fraction { denominator: "5" numerator: "3" } } -> 5분의 3

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="fraction", kind="verbalize", deterministic=deterministic)

        quote = pynutil.delete('"')

        def operand():
            # Either a root expression ("√3" -> "루트" + NEMO_SPACE + "3") or any value without "√".
            with_root = pynini.cross("√", "루트") + pynutil.insert(NEMO_SPACE) + pynini.closure(NEMO_NOT_QUOTE)
            without_root = pynini.closure(NEMO_NOT_QUOTE - "√")
            return with_root | without_root

        denominator_field = pynutil.delete('denominator: "') + operand() + quote
        numerator_field = pynutil.delete('numerator: "') + operand() + quote

        # The tagger may already supply the connective as a feature; it is dropped and re-inserted below.
        optional_connective_feature = pynini.closure(
            pynutil.delete('morphosyntactic_features:') + delete_space + pynutil.delete('"분의"') + delete_space,
            0,
            1,
        )

        # denominator + "분의" + NEMO_SPACE + numerator
        fraction_only = (
            denominator_field
            + pynutil.delete(NEMO_SPACE)
            + optional_connective_feature
            + pynutil.insert("분의")
            + pynutil.insert(NEMO_SPACE)
            + numerator_field
        )

        # integer_part value followed by NEMO_SPACE
        whole_part = (
            pynutil.delete('integer_part:')
            + delete_space
            + quote
            + pynini.closure(pynini.union("√", ".", NEMO_NOT_QUOTE - '"'))
            + quote
            + pynutil.insert(NEMO_SPACE)
        )
        mixed_number = whole_part + delete_space + fraction_only

        # negative value (e.g. "마이너스") followed by NEMO_SPACE
        sign = (
            pynutil.delete('negative:')
            + delete_space
            + quote
            + pynini.closure(NEMO_NOT_QUOTE - '"')
            + quote
            + delete_space
            + pynutil.insert(NEMO_SPACE)
        )

        graph = pynini.closure(sign, 0, 1) + (mixed_number | fraction_only)

        self.fst = self.delete_tokens(graph).optimize()
# ---- nemo_text_processing/text_normalization/ko/verbalizers/measure.py ----
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import (
    NEMO_NOT_QUOTE,
    NEMO_SIGMA,
    GraphFst,
    delete_space,
    insert_space,
)


class MeasureFst(GraphFst):
    """
    Finite state transducer for verbalizing Korean measure tokens into surface text.
        measure { cardinal { integer: "<...>" } units: "<...>" }

    Converts tokens like:
        measure { cardinal { integer: "이" } units: "킬로그램" }
        measure { fraction { numerator: "이" denominator: "삼" } units: "킬로미터" }

    into surface text:
        "이 킬로그램", "삼분의 이 킬로미터"

    Args:
        decimal, cardinal, fraction: Sub-verbalizers handling number types. All three are
            required in practice: their ``fst`` attribute is read unconditionally.
        deterministic: If True, outputs a single normalized form.
    """

    def __init__(
        self,
        decimal: GraphFst = None,
        cardinal: GraphFst = None,
        fraction: GraphFst = None,
        deterministic: bool = True,
    ):
        super().__init__(name="measure", kind="verbalize", deterministic=deterministic)

        # Combine all numeric verbalizers
        graph_cardinal = cardinal.fst
        graph_decimal = decimal.fst
        graph_fraction = fraction.fst

        # Add a space after "마이너스" if it appears within numeric blocks
        minus_space_rewrite = pynini.cdrewrite(pynini.cross("마이너스", "마이너스 "), "", "", NEMO_SIGMA).optimize()

        # Apply rewrite to each numeric subgraph to ensure spacing after "마이너스"
        cardinal_spaced = graph_cardinal @ minus_space_rewrite
        fraction_spaced = graph_fraction @ minus_space_rewrite
        decimal_spaced = graph_decimal @ minus_space_rewrite

        # Combine all supported numeric types (cardinal | decimal | fraction)
        number_block = decimal_spaced | cardinal_spaced | fraction_spaced

        # Extract and output unit string
        units = (
            delete_space
            + pynutil.delete("units:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )

        # Normal form: number first, then unit
        main = number_block + insert_space + units

        # preserve_order form: unit first, then number, optionally followed by "preserve_order: true"
        preserve_order = delete_space + pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true")
        alt = units + insert_space + number_block + pynini.closure(preserve_order)

        graph = main | alt

        # Merge and clean tokens
        self.fst = self.delete_tokens(graph).optimize()


# ---- nemo_text_processing/text_normalization/ko/verbalizers/money.py ----
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space

# ===== whitespace & token helpers =====
sp = pynini.closure(delete_space)  # absorb 0+ spaces
FIELD_VAL = pynini.closure(NEMO_NOT_QUOTE, 1)  # non-empty quoted value


def del_key_val(key: str):
    """
    Delete the token field prefix and quotes, keep only the value.

    Input format:  [sp] key: "<value>"
    Output:        <value>

    Example:
        input  'integer_part: "삼백오십"'
        output '삼백오십'
    """
    return (sp + pynutil.delete(f'{key}: "') + FIELD_VAL + pynutil.delete('"')).optimize()


def drop_key_val(key: str):
    """
    Delete the entire key-value pair (key and its quoted value).

    Input format:  [sp] key: "<value>"
    Output:        (nothing)

    Example:
        input  'minor_part: "십"'
        output ''
    """
    # The value itself has to be deleted as well; passing FIELD_VAL through unchanged
    # would copy the value to the output exactly like del_key_val does.
    return (sp + pynutil.delete(f'{key}: "') + pynutil.delete(FIELD_VAL) + pynutil.delete('"')).optimize()


def drop_key_exact(key: str, val: str):
    """
    Delete the exact key-value pair if it matches the given value.

    Input format:  [sp] key: "val"
    Output:        (nothing)

    Example:
        input  'currency_maj: "원"'
        output ''
    """
    return (sp + pynutil.delete(f'{key}: "{val}"')).optimize()


class MoneyFst(GraphFst):
    """
    Verbalize Korean money.

    Input tokens:
        tokens { money { integer_part: "..." currency_maj: "..." [minor_part: "..."] } }

    Period (e.g., /월, /년, …) is intentionally NOT handled here.
    A trailing ``minor_part`` field is accepted and dropped.
    Output examples:
        integer_part: "십"   currency_maj: "원"  -> "십원"
        integer_part: "삼십억" currency_maj: "원" -> "삼십억원"
        integer_part: "이백" currency_maj: "달러" -> "이백 달러"

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="money", kind="verbalize", deterministic=deterministic)

        # --- fields ---
        integer_part = del_key_val("integer_part")
        minor_part_drop = drop_key_val("minor_part")  # minor units are not verbalized
        currency_val_any = del_key_val("currency_maj")  # ex) "원", "달러", "유로"
        won_key_drop = drop_key_exact("currency_maj", "원")  # field removed, "원" is re-inserted below

        # ===== KRW (원) =====
        # (A) [integer] [원] -> "{integer}원"
        won_a = integer_part + sp + won_key_drop + pynutil.insert("원")
        # (B) [원] [integer] -> "{integer}원"
        won_b = won_key_drop + sp + integer_part + pynutil.insert("원")
        won_core = won_a | won_b
        won_core = (won_core + pynini.closure(minor_part_drop, 0, 1)).optimize()

        # ===== Other currencies =====
        # "{integer} {currency}" (KRW sticks; others are spaced)
        other_core = (integer_part + insert_space + currency_val_any).optimize()
        other_core = (other_core + pynini.closure(minor_part_drop, 0, 1)).optimize()

        # ===== combine (no period) =====
        # The generic path also matches "원", so the KRW path gets the lower weight.
        graph_core = (pynutil.add_weight(won_core, 0.0) | pynutil.add_weight(other_core, 0.5)).optimize()

        # no trailing period mapping
        graph = graph_core

        # strip tokens wrapper
        self.fst = self.delete_tokens(graph).optimize()


# ---- nemo_text_processing/text_normalization/ko/verbalizers/ordinal.py ----
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space


class OrdinalFst(GraphFst):
    """
    Finite state transducer for verbalizing Korean ordinals.

    The value of the ``integer`` field is emitted unchanged; only the field name and
    the quotes are removed, e.g.
        tokens { ordinal { integer: "첫번째" } } -> 첫번째

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)

        graph_integer = (
            pynutil.delete("integer:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE)
            + pynutil.delete("\"")
        )

        final_graph = graph_integer

        delete_tokens = self.delete_tokens(final_graph)
        self.fst = delete_tokens.optimize()


# ---- nemo_text_processing/text_normalization/ko/verbalizers/telephone.py ----
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space


class TelephoneFst(GraphFst):
    """
    Finite state transducer for verbalizing Korean telephone numbers.

    Input:
        telephone { [country_code: "..."] number_part: "..." [extension: "..."] }
    Output:
        ["국가번호 " + country_code + insert_space] + number_part [+ ", 내선 " + extension]

    All field values are emitted unchanged, e.g.
        telephone { number_part: "팔영영 오오오 영영영영" extension: "이삼사" }
            -> 팔영영 오오오 영영영영, 내선 이삼사

    Unlike the other verbalizers in this package, the ``telephone { ... }`` wrapper is
    removed by this graph itself instead of through ``self.delete_tokens``.

    Args:
        deterministic: if True provides a single transduction; if False allows multiple.
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="telephone", kind="verbalize", deterministic=deterministic)

        # country_code (optional, prefixed with "국가번호 " and followed by insert_space)
        country = (
            pynini.closure(delete_space, 0, 1)
            + pynutil.delete('country_code: "')
            + pynutil.insert("국가번호 ")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
            + insert_space
        )

        # number_part (mandatory)
        number = (
            pynini.closure(delete_space, 0, 1)
            + pynutil.delete('number_part: "')
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )

        # extension (optional, prepend with ", 내선 ")
        ext_field = (
            pynini.closure(delete_space, 0, 1)
            + pynutil.delete('extension: "')
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        extension_opt = pynini.closure(pynutil.insert(", 내선 ") + ext_field, 0, 1)

        # remove wrapper "telephone { ... }"
        graph = (
            pynutil.delete("telephone")
            + pynini.closure(delete_space, 0, 1)
            + pynutil.delete("{")
            + pynini.closure(country, 0, 1)
            + number
            + extension_opt
            + pynini.closure(delete_space, 0, 1)
            + pynutil.delete("}")
        )

        self.fst = graph.optimize()
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space


class TimeFst(GraphFst):
    """
    Finite state transducer for verbalizing time.

    Accepts an optional leading ``suffix`` field followed by one of
        hours minutes seconds | hours minutes | hours | minutes | seconds
    and emits the quoted values with the field names removed. A ``minutes`` value of
    exactly "영분" and a ``seconds`` value of exactly "영초" are emitted as empty strings.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="time", kind="verbalize", deterministic=deterministic)

        any_value = pynini.closure(NEMO_NOT_QUOTE)

        def plain_field(key):
            # key: "<value>"  ->  <value>
            return pynutil.delete(f'{key}: "') + any_value + pynutil.delete('"')

        def zero_suppressing_field(key, zero_word):
            # Like plain_field, except that the value zero_word is mapped to nothing.
            opening = pynutil.delete(f'{key}: "')
            closing = pynutil.delete('"')
            silent_zero = opening + pynini.cross(zero_word, "") + closing
            spoken = opening + (any_value - zero_word) + closing
            return silent_zero | spoken

        hours = plain_field("hours")
        minutes = zero_suppressing_field("minutes", "영분")
        seconds = zero_suppressing_field("seconds", "영초")
        suffix = plain_field("suffix")

        # Separator between two fields: drop the input spacing, then apply insert_space.
        gap = delete_space + insert_space

        clock = (hours + gap + minutes + gap + seconds) | (hours + gap + minutes) | hours | minutes | seconds
        graph = pynini.closure(suffix + gap, 0, 1) + clock

        self.fst = self.delete_tokens(graph).optimize()
import pynini

from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst
from nemo_text_processing.text_normalization.ko.verbalizers.cardinal import CardinalFst
from nemo_text_processing.text_normalization.ko.verbalizers.date import DateFst
from nemo_text_processing.text_normalization.ko.verbalizers.decimal import DecimalFst
from nemo_text_processing.text_normalization.ko.verbalizers.electronic import ElectronicFst
from nemo_text_processing.text_normalization.ko.verbalizers.fraction import FractionFst
from nemo_text_processing.text_normalization.ko.verbalizers.measure import MeasureFst
from nemo_text_processing.text_normalization.ko.verbalizers.money import MoneyFst
from nemo_text_processing.text_normalization.ko.verbalizers.ordinal import OrdinalFst
from nemo_text_processing.text_normalization.ko.verbalizers.telephone import TelephoneFst
from nemo_text_processing.text_normalization.ko.verbalizers.time import TimeFst
from nemo_text_processing.text_normalization.ko.verbalizers.whitelist import WhiteListFst
from nemo_text_processing.text_normalization.ko.verbalizers.word import WordFst


class VerbalizeFst(GraphFst):
    """
    Union of all Korean verbalizer grammars for a single token.
    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
    More details to deployment at NeMo/tools/text_processing_deployment.

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple options (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="verbalize", kind="verbalize", deterministic=deterministic)

        # The measure verbalizer reuses these three number verbalizers, so they are built first.
        cardinal = CardinalFst(deterministic=deterministic)
        decimal = DecimalFst(deterministic=deterministic)
        fraction = FractionFst(deterministic=deterministic)

        date = DateFst(deterministic=deterministic)
        ordinal = OrdinalFst(deterministic=deterministic)
        word = WordFst(deterministic=deterministic)
        whitelist = WhiteListFst(deterministic=deterministic)
        time = TimeFst(deterministic=deterministic)
        money = MoneyFst(deterministic=deterministic)
        telephone = TelephoneFst(deterministic=deterministic)
        measure = MeasureFst(decimal=decimal, cardinal=cardinal, fraction=fraction, deterministic=deterministic)
        electronic = ElectronicFst(deterministic=deterministic)

        members = [
            cardinal,
            ordinal,
            word,
            decimal,
            fraction,
            date,
            whitelist,
            time,
            money,
            telephone,
            measure,
            electronic,
        ]

        self.fst = pynini.union(*(member.fst for member in members)).optimize()
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import ( + NEMO_SIGMA, + NEMO_WHITE_SPACE, + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) +from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.utils.logging import logger + + +class VerbalizeFinalFst(GraphFst): + """ + Finite state transducer that verbalizes an entire sentence, e.g. + tokens { name: "its" } tokens { time { hours: "twelve" minutes: "thirty" } } tokens { name: "now" } tokens { name: "." } -> its twelve thirty now . + + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. 
+ overwrite_cache: set to True to overwrite .far files + """ + + def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): + super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) + + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, f"ko_tn_{deterministic}_verbalizer.far") + + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["verbalize"] + logger.info(f'VerbalizeFinalFst graph was restored from {far_file}.') + else: + token_graph = VerbalizeFst(deterministic=deterministic) + + token_verbalizer = ( + pynutil.delete("tokens {") + delete_space + token_graph.fst + delete_space + pynutil.delete(" }") + ) + + space_between_tokens = pynini.closure(NEMO_WHITE_SPACE, 1) + + verbalizer = ( + delete_space + + token_verbalizer + + pynini.closure(space_between_tokens + token_verbalizer) + + delete_space + ) + + self.fst = verbalizer.optimize() + + if far_file: + generator_main(far_file, {"verbalize": self.fst}) diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/ko/verbalizers/whitelist.py new file mode 100644 index 000000000..786b3afbf --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/verbalizers/whitelist.py @@ -0,0 +1,39 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space
+
+
+class WhiteListFst(GraphFst):
+    """
+    tokens { name: "부터" } -> 부터
+    tokens { name: "~" } -> ~
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="whitelist", kind="verbalize", deterministic=deterministic)
+        graph = (
+            pynutil.delete("name:")
+            + delete_space
+            + pynutil.delete("\"")
+            + pynini.closure(NEMO_CHAR - " ", 1)
+            + pynutil.delete("\"")
+        )
+
+        graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA)
+
+        self.fst = graph.optimize()
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/word.py b/nemo_text_processing/text_normalization/ko/verbalizers/word.py
new file mode 100644
index 000000000..a14abd553
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/word.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst
+
+
+class WordFst(GraphFst):
+    """
+    Korean verbalizer for word.
+ tokens { name: "이름" } -> 이름 + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="word", kind="verbalize", deterministic=deterministic) + + graph = pynutil.delete("name: \"") + NEMO_NOT_QUOTE + pynutil.delete("\"") + self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index 82f8f43d2..1a9219574 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -174,6 +174,9 @@ def __init__( elif lang == 'ja': from nemo_text_processing.text_normalization.ja.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.text_normalization.ja.verbalizers.verbalize_final import VerbalizeFinalFst + elif lang == 'ko': + from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst else: raise NotImplementedError(f"Language {lang} has not been supported yet.") @@ -720,7 +723,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=["en", "de", "es", "fr", "hu", "sv", "zh", "ar", "it", "hy", "ja", "hi"], + choices=["en", "de", "es", "fr", "hu", "sv", "zh", "ar", "it", "hy", "ja", "hi", "ko"], default="en", type=str, ) diff --git a/nemo_text_processing/text_normalization/run_evaluate.py b/nemo_text_processing/text_normalization/run_evaluate.py index 0438579a7..fc9b21c29 100644 --- a/nemo_text_processing/text_normalization/run_evaluate.py +++ b/nemo_text_processing/text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy', 'hi'], + choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy', 'hi', 'ko'], default="en", type=str, ) diff --git 
a/tests/nemo_text_processing/ko/__init__.py b/tests/nemo_text_processing/ko/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/tests/nemo_text_processing/ko/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..40187f74e --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,48 @@ +1~일 +2~이 +3~삼 +123~백이십삼 +13000~만삼천 +9000~구천 +123000~십이만삼천 +123000012~일억이천삼백만십이 +1000000~백만 +100000000~일억 +1000000000000~일조 +100000000000000~백조 +20000000000001~이십조일 +800000000001001~팔백조천일 +82345670123135111~팔경이천삼백사십오조육천칠백일억이천삼백십삼만오천백십일 +9999999999999~구조구천구백구십구억구천구백구십구만구천구백구십구 +99999999999999~구십구조구천구백구십구억구천구백구십구만구천구백구십구 +999999999999999~구백구십구조구천구백구십구억구천구백구십구만구천구백구십구 +9999999999999999~구천구백구십구조구천구백구십구억구천구백구십구만구천구백구십구 +19~십구 +76~칠십육 +379~삼백칠십구 +850~팔백오십 +1004~천사 +8326~팔천삼백이십육 +10383~만삼백팔십삼 +34892~삼만사천팔백구십이 +573234~오십칠만삼천이백삼십사 +982010~구십팔만이천십 +2349023~이백삼십사만구천이십삼 +4303189~사백삼십만삼천백팔십구 +60321589~육천삼십이만천오백팔십구 +88234568~팔천팔백이십삼만사천오백육십팔 +792133923~칠억구천이백십삼만삼천구백이십삼 +187624689~일억팔천칠백육십이만사천육백팔십구 +2304050708~이십삼억사백오만칠백팔 +6436789729~육십사억삼천육백칠십팔만구천칠백이십구 +78234580257~칠백팔십이억삼천사백오십팔만이백오십칠 +987654321345~구천팔백칠십육억오천사백삼십이만천삼백사십오 
+2345678901234~이조삼천사백오십육억칠천팔백구십만천이백삼십사 +35791357913579~삼십오조칠천구백십삼억오천칠백구십일만삼천오백칠십구 +470369258147036~사백칠십조삼천육백구십이억오천팔백십사만칠천삼십육 +5048258149517395~오천사십팔조이천오백팔십일억사천구백오십일만칠천삼백구십오 +67890123045607890~육경칠천팔백구십조천이백삼십억사천오백육십만칠천팔백구십 +-2~마이너스 이 +-93~마이너스 구십삼 +-90325~마이너스 구만삼백이십오 +-3234567~마이너스 삼백이십삼만사천오백육십칠 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..c3e81a25c --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_date.txt @@ -0,0 +1,29 @@ +2024년~이천이십사년 +3월~삼월 +15일~십오일 +2024/01/30~이천이십사년 일월 삼십일 +10/30/2024~이천이십사년 시월 삼십일 +29/12/2000~이천년 십이월 이십구일 +2024/3/20~이천이십사년 삼월 이십일 +2024-07-23~이천이십사년 칠월 이십삼일 +1999.9.17~천구백구십구년 구월 십칠일 +기원전128년~기원전 백이십팔년 +기원후1390년~기원후 천삼백구십년 +28일수요일~이십팔일 수요일 +1900년대~천구백년대 +1월1일(월)〜3일(수)~일월 일일 월요일 부터 삼일 수요일 +5월10일(금)〜15일(수)~오월 십일 금요일 부터 십오일 수요일 +8월20일〜25일~팔월 이십일 부터 이십오일 +12월30일(토)〜1월2일(화)~십이월 삼십일 토요일 부터 일월 이일 화요일 +2월28일(목)〜3월3일(일)~이월 이십팔일 목요일 부터 삼월 삼일 일요일 +6월1일〜5일~유월 일일 부터 오일 +10월8일(화)〜10일(목)~시월 팔일 화요일 부터 십일 목요일 +1970〜1980년대~천구백칠십 부터 천구백팔십년대 +80〜90년대~팔십 부터 구십년대 +2010〜2020년대~이천십 부터 이천이십년대 +7월5〜9일(월〜금)~칠월 오 부터 구일 월요일부터금요일 +3월10〜15일(화〜일)~삼월 십 부터 십오일 화요일부터일요일 +11월1〜5일(수〜일)~십일월 일 부터 오일 수요일부터일요일 +2023년3월1일(수)〜6월12일(화)~이천이십삼년 삼월 일일 수요일 부터 유월 십이일 화요일 +2024년1월15일(월)〜2월10일(토)~이천이십사년 일월 십오일 월요일 부터 이월 십일 토요일 +2025년12월20일(토)〜2026년1월5일(월)~이천이십오년 십이월 이십일 토요일 부터 이천이십육년 일월 오일 월요일 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_decimal.txt new file mode 100644 index 000000000..d363c5bb2 --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_decimal.txt @@ -0,0 +1,28 @@ +-0.1~마이너스 영점일 +-0.5~마이너스 영점오 +-1.1~마이너스 일점일 +-2.5~마이너스 이점오 +-4.2~마이너스 사점이 +-11.99~마이너스 십일점구구 +-15.8~마이너스 십오점팔 +-25.3~마이너스 이십오점삼 
+-30.8~마이너스 삼십점팔 +-72.4~마이너스 칠십이점사 +-100.5~마이너스 백점오 +0.1~영점일 +0.5~영점오 +1.1~일점일 +2.5~이점오 +4.2~사점이 +11.99~십일점구구 +15.8~십오점팔 +25.3~이십오점삼 +30.8~삼십점팔 +42.75~사십이점칠오 +72.4~칠십이점사 +100.5~백점오 +123.99~백이십삼점구구 +165.4~백육십오점사 +999.99~구백구십구점구구 +1000.01~천점영일 +123456.2234~십이만삼천사백오십육점이이삼사 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_electronic.txt new file mode 100644 index 000000000..4e09d0db2 --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_electronic.txt @@ -0,0 +1,20 @@ +a@hotmail.de~a 골뱅이 hotmail 닷 디이 +a@hotmail.fr~a 골뱅이 hotmail 닷 에프알 +a@hotmail.it~a 골뱅이 hotmail 닷 아이티 +a@aol.it~a 골뱅이 aol 닷 아이티 +a@msn.it~a 골뱅이 msn 닷 아이티 +abc@nvidia.app~abc 골뱅이 nvidia 닷 앱 +user01@gmail.co.kr~user영일 골뱅이 gmail 닷 씨오 닷 케이알 +nvidia.co.kr~nvidia 닷 씨오 닷 케이알 +1234-5678-9012-3456~신용카드 일이삼사 오육칠팔 구영일이 삼사오육 +2345-2222-3333-4444~신용카드 이삼사오 이이이이 삼삼삼삼 사사사사 +9090-1234-5555-9876~신용카드 구영구영 일이삼사 오오오오 구팔칠육 +카드 마지막 네자리 3456~카드 마지막 네자리 삼사오육 +카드 마지막 4자리 7890~카드 마지막 네자리 칠팔구영 +카드 끝자리 3456~카드 끝자리 삼사오육 +사진.jpg~사진 닷 제이피지 +사진.JPG~사진 닷 제이피지 +사진.png~사진 닷 피엔지 +사진.PNG~사진 닷 피엔지 +문서.pdf~문서 닷 피디에프 +문서.PDF~문서 닷 피디에프 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt new file mode 100644 index 000000000..a183be59b --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt @@ -0,0 +1,14 @@ +1/2~이분의 일 +-1/2~마이너스 이분의 일 +1과1/2~일과 이분의 일 +2와12/33~이와 삼십삼분의 십이 +-1과1/2~마이너스 일과 이분의 일 +마이너스1과1/2~마이너스 일과 이분의 일 +마이너스1과√1/2~마이너스 일과 이분의 루트 일 +-1과√1/2~마이너스 일과 이분의 루트 일 +1과√1/2~일과 이분의 루트 일 +1과1/√3~일과 루트 삼분의 일 +1과1/3~일과 삼분의 일 +1과√1/4~일과 사분의 루트 일 +3분의1~삼분의 일 +121분의3221~백이십일분의 삼천이백이십일 \ No newline at end of file diff --git 
a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_measure.txt new file mode 100644 index 000000000..3d24c4f0d --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_measure.txt @@ -0,0 +1,40 @@ +1kg~일 킬로그램 +12kg~십이 킬로그램 +0kg~영 킬로그램 +100g~백 그램 +500g~오백 그램 +1m~일 미터 +12km~십이 킬로미터 +5cm~오 센티미터 +15mm~십오 밀리미터 +1000km~천 킬로미터 +9999m~구천구백구십구 미터 +10L~십 리터 +1l~일 리터 +250ml~이백오십 밀리리터 +1mL~일 밀리리터 +123h~백이십삼 시간 +3s~삼 초 +60km/h~육십 킬로미터 퍼 시간 +1m/s~일 미터 퍼 초 +12kg/kg~십이 킬로그램 퍼 킬로그램 +1km/m~일 킬로미터 퍼 미터 +50W~오십 와트 +440Hz~사백사십 헤르츠 +300N~삼백 뉴턴 +120rpm~백이십 분당회전수 +100%~백 퍼센트 +30°~삼십 도 +0.5kg~영점오 킬로그램 +2.3km~이점삼 킬로미터 +12.5L~십이점오 리터 +3.14m~삼점일사 미터 +0.03m~영점영삼 미터 +1/2kg~이분의 일 킬로그램 +2/3km~삼분의 이 킬로미터 +5/8cm~팔분의 오 센티미터 +2과3/4L~이과 사분의 삼 리터 +10과1/2km/h~십과 이분의 일 킬로미터 퍼 시간 +-3/4km~마이너스 사분의 삼 킬로미터 +-3.1km~마이너스 삼점일 킬로미터 +-3km~마이너스 삼 킬로미터 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_money.txt new file mode 100644 index 000000000..7a22075ab --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_money.txt @@ -0,0 +1,64 @@ +₩2000~이천원 +₩2,000~이천원 +2000원~이천원 +KRW 1230000~백이십삼만원 +₩460000000000~사천육백억원 +₩30억~삼십억원 +₩30조~삼십조원 +₩45억~사십오억원 +₩15000~만오천원 +₩1~일원 +₩20~이십원 +₩18925000~천팔백구십이만오천원 +₩18,925,000~천팔백구십이만오천원 +₩18854~만팔천팔백오십사원 +₩18129~만팔천백이십구원 +₩0~영원 +₩7~칠원 +₩10~십원 +₩11~십일원 +₩21~이십일원 +₩99~구십구원 +200원~이백원 +999원~구백구십구원 +₩1,000~천원 +₩9,999~구천구백구십구원 +₩10,000~만원 +₩20,000~이만원 +₩100,000~십만원 +₩1,000,000~백만원 +₩2,500,000~이백오십만원 +₩12,345~만이천삼백사십오원 +₩1,234,567~백이십삼만사천오백육십칠원 +₩10,000,000~천만원 +₩23,456,000~이천삼백사십오만육천원 +₩100,000,000~일억원 +₩123,000,000~일억이천삼백만원 +₩2억~이억원 +₩12억~십이억원 +₩100억~백억원 +₩1,000억~천억원 +₩3,400억~삼천사백억원 +₩4조~사조원 +₩12조~십이조원 +KRW 1,000,000,000~십억원 +krw 2,345,600,000~이십삼억사천오백육십만원 +₩2,300,000,000~이십삼억원 
+₩999,999,999~구억구천구백구십구만구천구백구십구원 +KRW 30,000,000~삼천만원 +krw 5000~오천원 +$ 0~영 달러 +$ 1~일 달러 +$ 200~이백 달러 +US$ 1,234,567~백이십삼만사천오백육십칠 달러 +$ 30억~삼십억 달러 +HK$ 300~삼백 홍콩 달러 +€500~오백 유로 +EUR 1,230,000~백이십삼만 유로 +¥2000~이천 엔 +JPY 1,230,000~백이십삼만 엔 +¥30조~삼십조 엔 +CAD 2,500~이천오백 캐나다 달러 +NZD 123,456~십이만삼천사백오십육 뉴질랜드 달러 +CHF 100~백 스위스 프랑 +AED 75~칠십오 아랍에미리트 디르함 diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_ordinal.txt new file mode 100644 index 000000000..3544a2aeb --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_ordinal.txt @@ -0,0 +1,54 @@ +1번째~첫번째 +2번째~두번째 +3번째~세번째 +4번째~네번째 +5번째~다섯번째 +6번째~여섯번째 +7번째~일곱번째 +8번째~여덟번째 +9번째~아홉번째 +10번째~열번째 +11번째~열한번째 +12번째~열두번째 +13번째~열세번째 +14번째~열네번째 +15번째~열다섯번째 +16번째~열여섯번째 +17번째~열일곱번째 +18번째~열여덟번째 +19번째~열아홉번째 +20번째~스무번째 +21번째~스물한번째 +22번째~스물두번째 +23번째~스물세번째 +24번째~스물네번째 +25번째~스물다섯번째 +26번째~스물여섯번째 +27번째~스물일곱번째 +28번째~스물여덟번째 +29번째~스물아홉번째 +30번째~서른번째 +31번째~서른한번째 +32번째~서른두번째 +33번째~서른세번째 +34번째~서른네번째 +35번째~서른다섯번째 +36번째~서른여섯번째 +37번째~서른일곱번째 +38번째~서른여덟번째 +39번째~서른아홉번째 +100번째~백번째 +189번째~백팔십구번째 +1034번째~천삼십사번째 +7324번째~칠천삼백이십사번째 +23456번째~이만삼천사백오십육번째 +78903번째~칠만팔천구백삼번째 +345678번째~삼십사만오천육백칠십팔번째 +987654번째~구십팔만칠천육백오십사번째 +1000000번째~백만번째 +5678901번째~오백육십칠만팔천구백일번째 +89123456번째~팔천구백십이만삼천사백오십육번째 +62345098번째~육천이백삼십사만오천구십팔번째 +235067092번째~이억삼천오백육만칠천구십이번째 +876543210번째~팔억칠천육백오십사만삼천이백십번째 +1000000000번째~십억번째 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_telephone.txt new file mode 100644 index 000000000..b6e573aec --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_telephone.txt @@ -0,0 +1,33 @@ ++1 123-123-5678~국가번호 일 일이삼 일이삼 오육칠팔 +123-123-5678~일이삼 일이삼 오육칠팔 ++1-123-123-5678~국가번호 일 일이삼 일이삼 오육칠팔 ++1 (123)-123-5678~국가번호 일 일이삼 일이삼 오육칠팔 +(123)-123-5678~일이삼 일이삼 오육칠팔 +555.555.5555~오오오 
오오오 오오오오 +(123) 123-5678~일이삼 일이삼 오육칠팔 +010-3713-7050~영일영 삼칠일삼 칠영오영 ++82 123-456-7890~국가번호 팔이 일이삼 사오육 칠팔구영 ++82-123-4567-8901~국가번호 팔이 일이삼 사오육칠 팔구영일 ++44-207-555-1234~국가번호 사사 이영칠 오오오 일이삼사 +123.456-7890~일이삼 사오육 칠팔구영 +123-456.7890~일이삼 사오육 칠팔구영 +(987)-654-3210~구팔칠 육오사 삼이일영 +(987) 654-3210~구팔칠 육오사 삼이일영 ++7 000-000-0000~국가번호 칠 영영영 영영영 영영영영 +000.000.0000~영영영 영영영 영영영영 +271-828-1828~이칠일 팔이팔 일팔이팔 +314-159-2653~삼일사 일오구 이육오삼 +(010) 123-4567~영일영 일이삼 사오육칠 ++358-123-456-7890~국가번호 삼오팔 일이삼 사오육 칠팔구영 ++1 800-555-0000~국가번호 일 팔영영 오오오 영영영영 +(800) 555-0000~팔영영 오오오 영영영영 ++12 345-678-9012~국가번호 일이 삼사오 육칠팔 구영일이 ++999 999-999-9999~국가번호 구구구 구구구 구구구 구구구구 +321.654.0987~삼이일 육오사 영구팔칠 ++82 010-1234-5678~국가번호 팔이 영일영 일이삼사 오육칠팔 +(999)-000-0000~구구구 영영영 영영영영 ++1-123.456.7890~국가번호 일 일이삼 사오육 칠팔구영 ++82-123.456-7890~국가번호 팔이 일이삼 사오육 칠팔구영 +111-222-3333~일일일 이이이 삼삼삼삼 +909-808-7070~구영구 팔영팔 칠영칠영 +(555)555-5555~오오오 오오오 오오오오 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..2fb79402d --- /dev/null +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_time.txt @@ -0,0 +1,25 @@ +오전10시30분~오전 열시 삼십분 +오전 10시30분~오전 열시 삼십분 +오전 10:30~오전 열시 삼십분 +오전 10:30:05~오전 열시 삼십분 오초 +오후 3시 반~오후 세시 반 +오후3시30분~오후 세시 삼십분 +오후 03:30~오후 세시 삼십분 +새벽 4시 5분~새벽 네시 오분 +새벽 04:05~새벽 네시 오분 +아침 7시~아침 일곱시 +낮 12시 15분~낮 열두시 십오분 +저녁 8시 45분~저녁 여덟시 사십오분 +밤 11시 55분 5초~밤 열한시 오십오분 오초 +밤 11:50:05~밤 열한시 오십분 오초 +정오~열두시 +자정~영시 +14:05~십사시 오분 +18:05~십팔시 오분 +23:00~이십삼시 +00:30:00~영시 삼십분 +24:03:38~이십사시 삼분 삼십팔초 +오전 0시 15분~오전 영시 십오분 +오후 12시 10분~오후 열두시 십분 +아침7시1분~아침 일곱시 일분 +저녁9시09분~저녁 아홉시 구분 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py new file mode 100644 index 000000000..763b7e607 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_cardinal.py @@ -0,0 +1,33 @@ +# Copyright (c) 2025, 
NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import parse_test_case_file + + +class TestCardinal: + normalizer_ko = Normalizer( + lang='ko', cache_dir='export/ko_tn_grammars_lower_cased', overwrite_cache=False, input_case='lower_cased' + ) + + @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + preds = self.normalizer_ko.normalize(test_input) + assert expected == preds diff --git a/tests/nemo_text_processing/ko/test_date.py b/tests/nemo_text_processing/ko/test_date.py new file mode 100644 index 000000000..5fe6966c9 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_date.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer +from ..utils import parse_test_case_file + + +class TestDate: + normalizer_ko = Normalizer( + lang='ko', cache_dir='export/ko_tn_grammars_lower_cased', overwrite_cache=False, input_case='lower_cased' + ) + + @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_date(self, test_input, expected): + preds = self.normalizer_ko.normalize(test_input) + assert expected == preds diff --git a/tests/nemo_text_processing/ko/test_decimal.py b/tests/nemo_text_processing/ko/test_decimal.py new file mode 100644 index 000000000..84b820d3f --- /dev/null +++ b/tests/nemo_text_processing/ko/test_decimal.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer +from ..utils import parse_test_case_file + + +class TestDecimal: + normalizer_ko = Normalizer( + lang='ko', cache_dir='export/ko_tn_grammars_lower_cased', overwrite_cache=False, input_case='lower_cased' + ) + + @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_decimal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_decimal(self, test_input, expected): + preds = self.normalizer_ko.normalize(test_input) + assert expected == preds diff --git a/tests/nemo_text_processing/ko/test_electronic.py b/tests/nemo_text_processing/ko/test_electronic.py new file mode 100644 index 000000000..d06099328 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_electronic.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer +from ..utils import parse_test_case_file + + +class TestElectronic: + normalizer_ko = Normalizer( + lang='ko', cache_dir='export/ko_tn_grammars_lower_cased', overwrite_cache=False, input_case='lower_cased' + ) + + @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_electronic.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_electronic(self, test_input, expected): + preds = self.normalizer_ko.normalize(test_input) + assert expected == preds diff --git a/tests/nemo_text_processing/ko/test_fraction.py b/tests/nemo_text_processing/ko/test_fraction.py new file mode 100644 index 000000000..605e6ad7d --- /dev/null +++ b/tests/nemo_text_processing/ko/test_fraction.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.text_normalization.normalize import Normalizer
+from ..utils import parse_test_case_file
+
+
+class TestFraction:
+    normalizer_ko = Normalizer(
+        lang='ko', cache_dir='export/ko_tn_grammars_lower_cased', overwrite_cache=False, input_case='lower_cased'
+    )
+
+    @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_fraction.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_fraction(self, test_input, expected):
+        preds = self.normalizer_ko.normalize(test_input)
+        assert expected == preds
diff --git a/tests/nemo_text_processing/ko/test_measure.py b/tests/nemo_text_processing/ko/test_measure.py
new file mode 100644
index 000000000..5a788e71f
--- /dev/null
+++ b/tests/nemo_text_processing/ko/test_measure.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer +from ..utils import parse_test_case_file + + +class TestMeasure: + normalizer_ko = Normalizer( + lang='ko', cache_dir='export/ko_tn_grammars_lower_cased', overwrite_cache=False, input_case='lower_cased' + ) + + @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_measure.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_measure(self, test_input, expected): + preds = self.normalizer_ko.normalize(test_input) + assert expected == preds diff --git a/tests/nemo_text_processing/ko/test_money.py b/tests/nemo_text_processing/ko/test_money.py new file mode 100644 index 000000000..b5dbd80a5 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_money.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer +from ..utils import parse_test_case_file + + +class TestMoney: + normalizer_ko = Normalizer( + lang='ko', cache_dir='export/ko_tn_grammars_lower_cased', overwrite_cache=False, input_case='lower_cased' + ) + + @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_money.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_money(self, test_input, expected): + preds = self.normalizer_ko.normalize(test_input) + assert expected == preds diff --git a/tests/nemo_text_processing/ko/test_ordinal.py b/tests/nemo_text_processing/ko/test_ordinal.py new file mode 100644 index 000000000..19a337bc7 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_ordinal.py @@ -0,0 +1,33 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import parse_test_case_file + + +class TestOrdinal: + normalizer_ko = Normalizer( + lang='ko', cache_dir='export/ko_tn_grammars_lower_cased', overwrite_cache=False, input_case='lower_cased' + ) + + @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_ordinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm_date(self, test_input, expected): + preds = self.normalizer_ko.normalize(test_input, verbose=True) + assert expected == preds diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh new file mode 100644 index 000000000..0fff6159a --- /dev/null +++ b/tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh @@ -0,0 +1,83 @@ +#! /bin/sh +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +TEST_DIR=${2:-"/workspace/tests"} + +runtest () { + input=$1 + echo "INPUT is $input" + cd ${GRAMMARS_DIR} + + # read test file + while read testcase; do + IFS='~' read written spoken <<< $testcase + # replace non breaking space with breaking space + # Use below if postprocessor is not used. Comment if it is used + #denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + # Use below if postprocessor is used. 
Comment if it is not used + denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration_pp.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + + # trim white space + spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$written" "$spoken" "$denorm_pred" + done < "$input" +} + + +testTNCardinal() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_cardinal.txt + runtest $input +} + +testTNOrdinalText() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_ordinal.txt + runtest $input +} + +testTNDecimalalText() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_decimal.txt + runtest $input +} + +testTNFractionText() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_fraction.txt + runtest $input +} + +testTNDateText() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_date.txt + runtest $input +} + +testTNTimeText() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_time.txt + runtest $input +} + +testTNMoneyText() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_money.txt + runtest $input +} + +testTNTelephoneText() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_telephone.txt + runtest $input +} + +testTNMeasureText() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_measure.txt + runtest $input +} + +testTNElectronicText() { + input=$TEST_DIR/ko/data_text_normalization/test_cases_electronic.txt + runtest $input +} + +# Remove all command-line arguments +shift $# + +# Load shUnit2 +. 
/workspace/shunit2/shunit2 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/test_telephone.py b/tests/nemo_text_processing/ko/test_telephone.py new file mode 100644 index 000000000..1875f72cd --- /dev/null +++ b/tests/nemo_text_processing/ko/test_telephone.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer +from ..utils import parse_test_case_file + + +class TestTelephone: + normalizer_ko = Normalizer( + lang='ko', cache_dir='export/ko_tn_grammars_lower_cased', overwrite_cache=False, input_case='lower_cased' + ) + + @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_telephone.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_telephone(self, test_input, expected): + preds = self.normalizer_ko.normalize(test_input) + assert expected == preds diff --git a/tests/nemo_text_processing/ko/test_time.py b/tests/nemo_text_processing/ko/test_time.py new file mode 100644 index 000000000..3f71a94db --- /dev/null +++ b/tests/nemo_text_processing/ko/test_time.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer +from ..utils import parse_test_case_file + + +class TestTime: + normalizer_ko = Normalizer( + lang='ko', cache_dir='export/ko_tn_grammars_lower_cased', overwrite_cache=False, input_case='lower_cased' + ) + + @parameterized.expand(parse_test_case_file('ko/data_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_time(self, test_input, expected): + preds = self.normalizer_ko.normalize(test_input) + assert expected == preds diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 6b82dfbec..765714883 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -106,6 +106,7 @@ def parse_args(): 'mr', 'ja', 'rw', + 'ko', ], type=str, default='en', @@ -312,6 +313,11 @@ def parse_args(): ClassifyFst as TNClassifyFst, ) from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst + elif args.language == 'ko': + from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ( + ClassifyFst as TNClassifyFst, + ) + from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}") export_grammars( output_dir=output_dir,