From 71581d0daea88a822f8ea52be42e4282b14a8cec Mon Sep 17 00:00:00 2001 From: Mai Anh Date: Fri, 13 Mar 2026 18:19:02 +0700 Subject: [PATCH 1/4] Add Portuguese (PT) text normalization: cardinal, ordinal, decimal, fraction Signed-off-by: Mai Anh --- .../text_normalization/normalize.py | 3 + .../text_normalization/pt/__init__.py | 13 + .../text_normalization/pt/data/__init__.py | 13 + .../pt/data/fractions/ordinal_exceptions.tsv | 2 + .../pt/data/fractions/powers_of_ten.tsv | 1 + .../pt/data/fractions/specials.tsv | 4 + .../pt/data/numbers/__init__.py | 13 + .../pt/data/numbers/cardinal_specials.tsv | 4 + .../numbers/decimal_fractional_specials.tsv | 3 + .../pt/data/numbers/decimal_specials.tsv | 2 + .../pt/data/numbers/digit.tsv | 9 + .../pt/data/numbers/hundreds.tsv | 8 + .../pt/data/numbers/quantity_words.tsv | 9 + .../pt/data/numbers/scales.tsv | 4 + .../pt/data/numbers/teens.tsv | 10 + .../pt/data/numbers/tens.tsv | 8 + .../pt/data/numbers/zero.tsv | 1 + .../pt/data/ordinals/__init__.py | 13 + .../pt/data/ordinals/digit.tsv | 10 + .../pt/data/ordinals/feminine.tsv | 11 + .../pt/data/ordinals/hundreds.tsv | 10 + .../pt/data/ordinals/specials.tsv | 2 + .../pt/data/ordinals/teen.tsv | 9 + .../pt/data/ordinals/ties.tsv | 8 + .../text_normalization/pt/graph_utils.py | 181 +++++++++++ .../text_normalization/pt/taggers/__init__.py | 13 + .../text_normalization/pt/taggers/cardinal.py | 289 ++++++++++++++++++ .../text_normalization/pt/taggers/decimal.py | 147 +++++++++ .../text_normalization/pt/taggers/fraction.py | 146 +++++++++ .../text_normalization/pt/taggers/ordinal.py | 93 ++++++ .../pt/taggers/tokenize_and_classify.py | 115 +++++++ .../text_normalization/pt/utils.py | 50 +++ .../pt/verbalizers/__init__.py | 13 + .../pt/verbalizers/cardinal.py | 67 ++++ .../pt/verbalizers/decimal.py | 80 +++++ .../pt/verbalizers/fraction.py | 118 +++++++ .../pt/verbalizers/ordinal.py | 66 ++++ .../pt/verbalizers/verbalize.py | 43 +++ .../pt/verbalizers/verbalize_final.py | 71 +++++ .../test_cases_cardinal.txt | 118 +++++++ .../test_cases_decimal.txt | 58 ++++ .../test_cases_fraction.txt | 21 ++ .../test_cases_ordinal.txt | 39 +++ .../nemo_text_processing/pt/test_cardinal.py | 11 +- tests/nemo_text_processing/pt/test_decimal.py | 11 +- .../nemo_text_processing/pt/test_fraction.py | 32 ++ tests/nemo_text_processing/pt/test_ordinal.py | 12 +- 47 files changed, 1960 insertions(+), 4 deletions(-) create mode 100644 nemo_text_processing/text_normalization/pt/__init__.py create mode 100644 nemo_text_processing/text_normalization/pt/data/__init__.py create mode 100644 nemo_text_processing/text_normalization/pt/data/fractions/ordinal_exceptions.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/fractions/powers_of_ten.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/fractions/specials.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/numbers/__init__.py create mode 100644 nemo_text_processing/text_normalization/pt/data/numbers/cardinal_specials.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/numbers/decimal_fractional_specials.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/numbers/decimal_specials.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/numbers/digit.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/numbers/hundreds.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/numbers/quantity_words.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/numbers/scales.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/numbers/teens.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/numbers/tens.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/numbers/zero.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/ordinals/__init__.py create mode 100644 nemo_text_processing/text_normalization/pt/data/ordinals/digit.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/ordinals/feminine.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/ordinals/hundreds.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/ordinals/specials.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/ordinals/teen.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/ordinals/ties.tsv create mode 100644 nemo_text_processing/text_normalization/pt/graph_utils.py create mode 100644 nemo_text_processing/text_normalization/pt/taggers/__init__.py create mode 100644 nemo_text_processing/text_normalization/pt/taggers/cardinal.py create mode 100644 nemo_text_processing/text_normalization/pt/taggers/decimal.py create mode 100644 nemo_text_processing/text_normalization/pt/taggers/fraction.py create mode 100644 nemo_text_processing/text_normalization/pt/taggers/ordinal.py create mode 100644 nemo_text_processing/text_normalization/pt/taggers/tokenize_and_classify.py create mode 100644 nemo_text_processing/text_normalization/pt/utils.py create mode 100644 nemo_text_processing/text_normalization/pt/verbalizers/__init__.py create mode 100644 nemo_text_processing/text_normalization/pt/verbalizers/cardinal.py create mode 100644 nemo_text_processing/text_normalization/pt/verbalizers/decimal.py create mode 100644 nemo_text_processing/text_normalization/pt/verbalizers/fraction.py create mode 100644 nemo_text_processing/text_normalization/pt/verbalizers/ordinal.py create mode 100644 nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py create mode 100644 nemo_text_processing/text_normalization/pt/verbalizers/verbalize_final.py create mode 100644 tests/nemo_text_processing/pt/data_text_normalization/test_cases_cardinal.txt create mode 100644 tests/nemo_text_processing/pt/data_text_normalization/test_cases_decimal.txt create mode 100644 tests/nemo_text_processing/pt/data_text_normalization/test_cases_fraction.txt create mode 100644 tests/nemo_text_processing/pt/data_text_normalization/test_cases_ordinal.txt create mode 100644 tests/nemo_text_processing/pt/test_fraction.py diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index 5e2f9ebb5..3dbe5b138 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -185,6 +185,9 @@ def __init__( if post_process: self.post_processor = PostProcessingFst(cache_dir=cache_dir, overwrite_cache=overwrite_cache) + elif lang == 'pt': + from nemo_text_processing.text_normalization.pt.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.text_normalization.pt.verbalizers.verbalize_final import VerbalizeFinalFst elif lang == 'ko': from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/text_normalization/pt/__init__.py b/nemo_text_processing/text_normalization/pt/__init__.py new file mode 100644 index 000000000..212acc24c --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use it except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/pt/data/__init__.py b/nemo_text_processing/text_normalization/pt/data/__init__.py new file mode 100644 index 000000000..9e3fb699d --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/pt/data/fractions/ordinal_exceptions.tsv b/nemo_text_processing/text_normalization/pt/data/fractions/ordinal_exceptions.tsv new file mode 100644 index 000000000..22c9ed16c --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/fractions/ordinal_exceptions.tsv @@ -0,0 +1,2 @@ +segundo meio +terceiro terço diff --git a/nemo_text_processing/text_normalization/pt/data/fractions/powers_of_ten.tsv b/nemo_text_processing/text_normalization/pt/data/fractions/powers_of_ten.tsv new file mode 100644 index 000000000..b19c44364 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/fractions/powers_of_ten.tsv @@ -0,0 +1 @@ +mil milésimo diff --git a/nemo_text_processing/text_normalization/pt/data/fractions/specials.tsv b/nemo_text_processing/text_normalization/pt/data/fractions/specials.tsv new file mode 100644 index 000000000..c140ca4ba --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/fractions/specials.tsv @@ -0,0 +1,4 @@ +connector e +minus menos +plural_suffix s +avos_suffix avos diff --git a/nemo_text_processing/text_normalization/pt/data/numbers/__init__.py b/nemo_text_processing/text_normalization/pt/data/numbers/__init__.py new file mode 100644 index 000000000..9e3fb699d --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/numbers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/pt/data/numbers/cardinal_specials.tsv b/nemo_text_processing/text_normalization/pt/data/numbers/cardinal_specials.tsv new file mode 100644 index 000000000..04ea91ee4 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/numbers/cardinal_specials.tsv @@ -0,0 +1,4 @@ +connector e +thousand mil +hundred_100 cem +hundred_1 cento diff --git a/nemo_text_processing/text_normalization/pt/data/numbers/decimal_fractional_specials.tsv b/nemo_text_processing/text_normalization/pt/data/numbers/decimal_fractional_specials.tsv new file mode 100644 index 000000000..c84a95f53 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/numbers/decimal_fractional_specials.tsv @@ -0,0 +1,3 @@ +001 mil e um +010 mil e dez +100 mil e cem diff --git a/nemo_text_processing/text_normalization/pt/data/numbers/decimal_specials.tsv b/nemo_text_processing/text_normalization/pt/data/numbers/decimal_specials.tsv new file mode 100644 index 000000000..f6257d9d1 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/numbers/decimal_specials.tsv @@ -0,0 +1,2 @@ +separator vírgula +minus menos diff --git a/nemo_text_processing/text_normalization/pt/data/numbers/digit.tsv b/nemo_text_processing/text_normalization/pt/data/numbers/digit.tsv new file mode 100644 index 000000000..1859416c8 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/numbers/digit.tsv @@ -0,0 +1,9 @@ +1 um +2 dois +3 três +4 quatro +5 cinco +6 seis +7 sete +8 oito +9 nove diff --git a/nemo_text_processing/text_normalization/pt/data/numbers/hundreds.tsv b/nemo_text_processing/text_normalization/pt/data/numbers/hundreds.tsv new file mode 100644 index 000000000..620f512b3 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/numbers/hundreds.tsv @@ -0,0 +1,8 @@ +2 duzentos +3 trezentos +4 quatrocentos +5 quinhentos +6 seiscentos +7 setecentos +8 oitocentos +9 novecentos diff --git a/nemo_text_processing/text_normalization/pt/data/numbers/quantity_words.tsv b/nemo_text_processing/text_normalization/pt/data/numbers/quantity_words.tsv new file mode 100644 index 000000000..a94cbd553 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/numbers/quantity_words.tsv @@ -0,0 +1,9 @@ +mil +milhão +milhões +bilhão +bilhões +trilhão +trilhões +quatrilhão +quatrilhões diff --git a/nemo_text_processing/text_normalization/pt/data/numbers/scales.tsv b/nemo_text_processing/text_normalization/pt/data/numbers/scales.tsv new file mode 100644 index 000000000..a3dffe4e0 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/numbers/scales.tsv @@ -0,0 +1,4 @@ +one_label plural_suffix magnitude_zeros +um milhão milhões 0 +um bilhão bilhões 9 +um trilhão trilhões 12 diff --git a/nemo_text_processing/text_normalization/pt/data/numbers/teens.tsv b/nemo_text_processing/text_normalization/pt/data/numbers/teens.tsv new file mode 100644 index 000000000..299c3dbf2 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/numbers/teens.tsv @@ -0,0 +1,10 @@ +10 dez +11 onze +12 doze +13 treze +14 quatorze +15 quinze +16 dezesseis +17 dezessete +18 dezoito +19 dezenove diff --git a/nemo_text_processing/text_normalization/pt/data/numbers/tens.tsv b/nemo_text_processing/text_normalization/pt/data/numbers/tens.tsv new file mode 100644 index 000000000..43c4a8bc6 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/numbers/tens.tsv @@ -0,0 +1,8 @@ +2 vinte +3 trinta +4 quarenta +5 cinquenta +6 sessenta +7 setenta +8 oitenta +9 noventa diff --git a/nemo_text_processing/text_normalization/pt/data/numbers/zero.tsv b/nemo_text_processing/text_normalization/pt/data/numbers/zero.tsv new file mode 100644 index 000000000..29be0f38b --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/numbers/zero.tsv @@ -0,0 +1 @@ +0 zero diff --git a/nemo_text_processing/text_normalization/pt/data/ordinals/__init__.py b/nemo_text_processing/text_normalization/pt/data/ordinals/__init__.py new file mode 100644 index 000000000..9e3fb699d --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/ordinals/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/pt/data/ordinals/digit.tsv b/nemo_text_processing/text_normalization/pt/data/ordinals/digit.tsv new file mode 100644 index 000000000..5fefbc3b8 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/ordinals/digit.tsv @@ -0,0 +1,10 @@ +primeiro um +segundo dois +terceiro três +quarto quatro +quinto cinco +sexto seis +sétimo sete +oitavo oito +nono nove +décimo dez diff --git a/nemo_text_processing/text_normalization/pt/data/ordinals/feminine.tsv b/nemo_text_processing/text_normalization/pt/data/ordinals/feminine.tsv new file mode 100644 index 000000000..c75ae5ed0 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/ordinals/feminine.tsv @@ -0,0 +1,11 @@ +primeiro primeira +segundo segunda +terceiro terceira +quarto quarta +quinto quinta +sexto sexta +sétimo sétima +oitavo oitava +nono nona +décimo décima +ésimo ésima diff --git a/nemo_text_processing/text_normalization/pt/data/ordinals/hundreds.tsv b/nemo_text_processing/text_normalization/pt/data/ordinals/hundreds.tsv new file mode 100644 index 000000000..6d919a86c --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/ordinals/hundreds.tsv @@ -0,0 +1,10 @@ +centésimo cem +centésimo cento +ducentésimo duzentos +trecentésimo trezentos +quadringentésimo quatrocentos +quincentésimo quinhentos +sexcentésimo seiscentos +septingentésimo setecentos +octingentésimo oitocentos +noningentésimo novecentos diff --git a/nemo_text_processing/text_normalization/pt/data/ordinals/specials.tsv b/nemo_text_processing/text_normalization/pt/data/ordinals/specials.tsv new file mode 100644 index 000000000..bb6933fe6 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/ordinals/specials.tsv @@ -0,0 +1,2 @@ +connector_in e +connector_out diff --git a/nemo_text_processing/text_normalization/pt/data/ordinals/teen.tsv b/nemo_text_processing/text_normalization/pt/data/ordinals/teen.tsv new file mode 100644 index 000000000..1b1e191c9 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/ordinals/teen.tsv @@ -0,0 +1,9 @@ +décimo primeiro onze +décimo segundo doze +décimo terceiro treze +décimo quarto catorze +décimo quinto quinze +décimo sexto dezesseis +décimo sétimo dezessete +décimo oitavo dezoito +décimo nono dezenove diff --git a/nemo_text_processing/text_normalization/pt/data/ordinals/ties.tsv b/nemo_text_processing/text_normalization/pt/data/ordinals/ties.tsv new file mode 100644 index 000000000..f40700034 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/ordinals/ties.tsv @@ -0,0 +1,8 @@ +vigésimo vinte +trigésimo trinta +quadragésimo quarenta +quinquagésimo cinquenta +sexagésimo sessenta +septuagésimo setenta +octogésimo oitenta +nonagésimo noventa diff --git a/nemo_text_processing/text_normalization/pt/graph_utils.py b/nemo_text_processing/text_normalization/pt/graph_utils.py new file mode 100644 index 000000000..0a2c29c0b --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/graph_utils.py @@ -0,0 +1,181 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use it except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +Portuguese (PT) text normalization graph utilities. + +Self-contained module with no dependency on en.graph_utils. Provides character/digit +symbols (NEMO_*), space helpers (delete_space, insert_space, delete_extra_space), +GraphFst base class, generator_main for FAR export, and PT-specific helpers +(filter_cardinal_punctuation, shift_cardinal_gender_pt). +""" + +import os +import string +from pathlib import Path +from typing import Dict + +import pynini +from pynini import Far +from pynini.export import export +from pynini.lib import byte, pynutil, utf8 + +from nemo_text_processing.utils.logging import logger + +# ---- Character/digit symbols (same semantics as EN) ---- +NEMO_CHAR = utf8.VALID_UTF8_CHAR +NEMO_DIGIT = byte.DIGIT +NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize() +NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize() +NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() +NEMO_SPACE = " " +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize() +NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, pynini.accep('"')).optimize() +NEMO_SIGMA = pynini.closure(NEMO_CHAR) + +delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) +insert_space = pynutil.insert(" ") +delete_extra_space = pynini.cross( + pynini.closure(NEMO_WHITE_SPACE, 1), " " +).optimize() + + +def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]) -> None: + """ + Export one or more graphs to an OpenFst Finite State Archive (FAR) file. + + Args: + file_name: path to the output .far file. + graphs: mapping of rule names to FST graphs to export. + """ + exporter = export.Exporter(file_name) + for rule, graph in graphs.items(): + exporter[rule] = graph.optimize() + exporter.close() + logger.info(f"Created {file_name}") + + +class GraphFst: + """ + Base class for all Portuguese text normalization grammar FSTs. + + Args: + name: name of the grammar (e.g. "cardinal", "decimal"). + kind: either "classify" or "verbalize". + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization). + """ + + def __init__(self, name: str, kind: str, deterministic: bool = True): + self.name = name + self.kind = kind + self._fst = None + self.deterministic = deterministic + + self.far_path = Path( + os.path.dirname(os.path.abspath(__file__)) + "/grammars/" + kind + "/" + name + ".far" + ) + if self.far_exist(): + self._fst = Far( + self.far_path, mode="r", arc_type="standard", far_type="default" + ).get_fst() + + def far_exist(self) -> bool: + return self.far_path.exists() + + @property + def fst(self) -> "pynini.FstLike": + return self._fst + + @fst.setter + def fst(self, fst): + self._fst = fst + + def add_tokens(self, fst) -> "pynini.FstLike": + return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") + + def delete_tokens(self, fst) -> "pynini.FstLike": + res = ( + pynutil.delete(f"{self.name}") + + delete_space + + pynutil.delete("{") + + delete_space + + fst + + delete_space + + pynutil.delete("}") + ) + return res @ pynini.cdrewrite( + pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA + ) + + +# ---- PT-specific (Brazilian: 1.000.000 or 1 000 000) ---- +cardinal_separator = pynini.string_map([".", " "]) + + +def filter_cardinal_punctuation(fst: "pynini.FstLike") -> "pynini.FstLike": + """ + Parse digit groups separated by cardinal_separator (e.g. 1.000.000) then apply fst. + + Args: + fst: FST that maps digit string to verbalized cardinal. + + Returns: + Composed FST that accepts digit strings with optional thousand separators. + """ + exactly_three = NEMO_DIGIT**3 + up_to_three = pynini.closure(NEMO_DIGIT, 1, 3) + cardinal_string = pynini.closure(NEMO_DIGIT, 1) + cardinal_string |= ( + up_to_three + + pynutil.delete(cardinal_separator) + + pynini.closure(exactly_three + pynutil.delete(cardinal_separator)) + + exactly_three + ) + return cardinal_string @ fst + + +def shift_cardinal_gender_pt(fst: "pynini.FstLike") -> "pynini.FstLike": + """ + Apply Portuguese masculine-to-feminine conversion for cardinal strings, e.g. + "um" -> "uma", "dois" -> "duas", "duzentos" -> "duzentas". + + Args: + fst: FST producing masculine cardinal verbalization. + + Returns: + FST that produces feminine form when composed with the same input. + """ + fem_ones = pynini.cdrewrite( + pynini.cross("um", "uma"), + "", + pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep('"')), + NEMO_SIGMA, + ) + fem_twos = pynini.cdrewrite( + pynini.cross("dois", "duas"), + "", + pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep('"')), + NEMO_SIGMA, + ) + fem_hundreds = pynini.cdrewrite( + pynini.cross("entos", "entas"), + pynini.union( + "duz", "trez", "quatroc", "quinh", "seisc", "setec", "oitoc", "novec" + ), + pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep('"')), + NEMO_SIGMA, + ) + return fst @ fem_ones @ fem_twos @ fem_hundreds diff --git a/nemo_text_processing/text_normalization/pt/taggers/__init__.py b/nemo_text_processing/text_normalization/pt/taggers/__init__.py new file mode 100644 index 000000000..9e3fb699d --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/taggers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/pt/taggers/cardinal.py b/nemo_text_processing/text_normalization/pt/taggers/cardinal.py new file mode 100644 index 000000000..15969e6b4 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/taggers/cardinal.py @@ -0,0 +1,289 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from functools import reduce + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import ( + NEMO_ALPHA, + NEMO_DIGIT, + NEMO_SIGMA, + NEMO_SPACE, + NEMO_WHITE_SPACE, + GraphFst, + delete_space, + insert_space, + filter_cardinal_punctuation, +) +from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels + + +class CardinalFst(GraphFst): + """ + Finite state transducer for classifying Portuguese cardinals, e.g. + "1000" -> cardinal { integer: "mil" } + "2.000.000" -> cardinal { integer: "dois milhões" } + "-5" -> cardinal { negative: "true" integer: "cinco" } + + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="cardinal", kind="classify", deterministic=deterministic) + + specials = { + row[0]: row[1] + for row in load_labels(get_abs_path("data/numbers/cardinal_specials.tsv")) + if len(row) >= 2 + } + connector_e = insert_space + pynutil.insert(specials["connector"]) + insert_space + thousand = specials["thousand"] + hundred_100 = specials["hundred_100"] + hundred_1 = specials["hundred_1"] + + scale_rows = load_labels(get_abs_path("data/numbers/scales.tsv")) + scales = [ + (row[0], row[1], int(row[2])) + for row in scale_rows + if len(row) >= 3 and row[2].strip().isdigit() + ] + + _num = lambda p: pynini.string_file(get_abs_path(f"data/numbers/{p}")) + zero, digit, teens, tens, hundreds = ( + _num("zero.tsv"), _num("digit.tsv"), _num("teens.tsv"), _num("tens.tsv"), _num("hundreds.tsv") + ) + digits_no_one = (NEMO_DIGIT - "1") @ digit + + graph_tens = teens | (tens + (pynutil.delete("0") | (connector_e + digit))) + self.tens = graph_tens.optimize() + self.two_digit_non_zero = pynini.union( + digit, graph_tens, (pynini.cross("0", NEMO_SPACE) + digit) + ).optimize() + + graph_hundreds = hundreds + pynini.union( + pynutil.delete("00"), + (connector_e + graph_tens), + (connector_e + digit), + ) + graph_hundreds |= pynini.cross("100", hundred_100) + graph_hundreds |= pynini.cross("1", hundred_1) + pynini.union( + pynutil.delete("00"), + (connector_e + graph_tens), + (connector_e + pynutil.delete("0") + digit), + ) + self.hundreds = graph_hundreds.optimize() + + h_comp_base = pynini.union(graph_hundreds, pynutil.delete("0") + graph_tens) + h_comp = h_comp_base | (pynutil.delete("00") + digit) + h_comp_no_one = h_comp_base | (pynutil.delete("00") + digits_no_one) + + pure_tens_input = pynini.union(*[pynini.accep(str(d * 10)) for d in range(1, 10)]) + graph_pure_tens_only = pure_tens_input @ graph_tens + graph_compound_tens = (pynini.closure(NEMO_DIGIT, 2, 2) - pure_tens_input) @ graph_tens + + graph_pure_components = pynini.union( + pynutil.delete("0") + graph_pure_tens_only, + pynutil.delete("00") + digit, + hundreds + pynutil.delete("00"), + pynini.cross("100", hundred_100), + ) + graph_compound_hundreds = pynini.union( + pynini.cross("1", hundred_1) + + pynini.union( + (connector_e + graph_tens), + (connector_e + pynutil.delete("0") + digit), + ), + hundreds + pynini.union( + (connector_e + graph_tens), + (connector_e + digit), + ), + ) + + suffix_after_mil = pynini.union( + pynutil.delete("000"), + (connector_e + graph_pure_components), + (insert_space + graph_compound_hundreds), + (insert_space + pynutil.delete("0") + graph_compound_tens), + ) + + t_comp = pynini.union( + pynutil.delete("000") + h_comp, + h_comp_no_one + insert_space + pynutil.insert(thousand) + suffix_after_mil, + pynini.cross("001", thousand) + suffix_after_mil, + ) + t_comp_no_one = pynini.union( + pynutil.delete("000") + h_comp_no_one, + h_comp_no_one + insert_space + pynutil.insert(thousand) + + ((insert_space + h_comp) | pynutil.delete("000")), + pynini.cross("001", thousand) + ((insert_space + h_comp) | pynutil.delete("000")), + ) + + graph_large_scales = pynini.accep("") + for one_label, plural_suffix, _ in reversed(scales): + g = pynutil.add_weight(pynini.cross("000001", one_label), -0.001) + g |= t_comp_no_one + pynutil.insert(plural_suffix) + g |= pynutil.delete("000000") + g += insert_space + graph_large_scales += g + + # 9/12-digit: scale block + trailing (million+thousands, billion+9digits) + scale_3_mil = self._scale_block_3(scales[0][0], scales[0][1], h_comp_no_one) + scale_3_bi = self._scale_block_3(scales[1][0], scales[1][1], h_comp_no_one) + graph_9 = self._build_scale_trailing_graph(scale_3_mil, t_comp, 6, 9) + graph_12 = self._build_scale_trailing_graph(scale_3_bi, graph_9, 9, 12) + pure_9, pure_12 = self._pure_inputs(9), self._pure_inputs(12) + trail_9 = (pure_9 @ graph_9, (NEMO_DIGIT**9 - pure_9) @ graph_9) + trail_12 = (pure_12 @ graph_12, (NEMO_DIGIT**12 - pure_12) @ graph_12) + + # Units 6 (u6): pure get "e" after scale; compound no "e" + u6_one = pynini.cross("000001", "1") @ digit + u6_pure = pynini.union( + u6_one, pynini.cross("001000", thousand), + pynini.cross("000010", "10") @ graph_tens, pynini.cross("000100", hundred_100), + (pynini.cross("010000", "10") @ graph_tens) + insert_space + pynutil.insert(thousand), + pynini.cross("100000", hundred_100) + insert_space + pynutil.insert(thousand), + ) + u6_compound = (NEMO_DIGIT**6 - self._pure_inputs(6)) @ t_comp + u6 = u6_pure | u6_compound + z18 = pynini.accep("0" * 18) # 18 zeros: branch no "e" + smaller_e = (connector_e + u6_pure) | u6_compound | pynutil.delete("0" * 6) + smaller = u6 | pynutil.delete("0" * 6) + graph_24 = ( + ((NEMO_DIGIT**18 - z18) + NEMO_DIGIT**6) @ (graph_large_scales + smaller_e) + ) | ((z18 + NEMO_DIGIT**6) @ (pynutil.delete(z18) + smaller)) + + trail_by_z = {9: trail_9, 12: trail_12} + magnitude_patterns = [ + self._build_magnitude_pattern( + one_label, plural_suffix, magnitude_zeros, trail_by_z.get(magnitude_zeros), + connector_e, insert_space, digit, graph_tens, graph_hundreds, + ) + for one_label, plural_suffix, magnitude_zeros in scales + if magnitude_zeros > 0 + ] + + pad = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0) + pad = pad @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) @ NEMO_DIGIT**24 + norm = pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) + norm = norm @ pynini.cdrewrite(pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 2), NEMO_SPACE), NEMO_ALPHA, NEMO_ALPHA, NEMO_SIGMA) + self.graph = reduce(lambda a, b: a | b, magnitude_patterns, pad @ graph_24 @ norm) | zero + self.graph = filter_cardinal_punctuation(self.graph).optimize() + + optional_minus_graph = pynini.closure( + pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1 + ) + final_graph = ( + optional_minus_graph + + pynutil.insert("integer: \"") + + self.graph + + pynutil.insert("\"") + ) + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() + + def _scale_block_3(self, one_label, plural_suffix, component_no_one): + """001->one_label, 000->'', else component+plural.""" + return pynini.union( + pynini.cross("001", one_label), + pynini.cross("000", ""), + (NEMO_DIGIT**3 - pynini.accep("001") - pynini.accep("000")) + @ (component_no_one + insert_space + pynutil.insert(plural_suffix)), + ) + + def _build_scale_trailing_graph(self, scale_3, sub_graph, trailing_len, total_len): + """total_len digits = scale_3 + trailing; no trailing space when trailing all zeros.""" + zt, ztotal = "0" * trailing_len, "0" * total_len + scale_nonzero = NEMO_DIGIT**3 - pynini.accep("000") + branches = [ + (pynini.accep("000") + NEMO_DIGIT**trailing_len) @ (pynutil.delete("000") + sub_graph), + (scale_nonzero + (NEMO_DIGIT**trailing_len - pynini.accep(zt))) @ (scale_3 + insert_space + sub_graph), + (scale_nonzero + pynini.accep(zt)) @ (scale_3 + pynutil.delete(zt)), + (pynini.accep("000") + pynini.accep(zt)) @ pynutil.delete(ztotal), + ] + return pynini.union(*branches) + + @staticmethod + def _pure_inputs(num_digits): + """Inputs 1, 10, 100, ... as num_digits-digit strings.""" + return pynini.union( + *[pynini.accep(str(10**k).zfill(num_digits)) for k in range(0, num_digits)] + ) + + def _magnitude_graph( + self, one_word, plural_suffix, zero_count, graph_digit, graph_tens, graph_hundreds, + connector_e, insert_space, trailing_pair=None, + ): + """Round (1–3 digit + scale + zeros); optional trailing (e + pure | space + compound).""" + zeros = "0" * zero_count + round_pats = [] + trail_pats = [] if trailing_pair else None + for L in (1, 2, 3): + total = zero_count + L + if L == 1: + lead = pynini.cross("1", one_word) | ( + (NEMO_DIGIT - "1") @ graph_digit + pynutil.insert(plural_suffix) + ) + else: + lead = ( + pynini.closure(NEMO_DIGIT, L, L) + @ (graph_tens if L == 2 else graph_hundreds) + + pynutil.insert(plural_suffix) + ) + lead_fst = NEMO_DIGIT**L @ lead + round_pats.append( + pynini.closure(NEMO_DIGIT, total, total) @ (lead_fst + pynutil.delete(zeros)) + ) + if trailing_pair: + pure, compound = trailing_pair + trail_part = ( + NEMO_DIGIT**zero_count @ (connector_e + pure) + | NEMO_DIGIT**zero_count @ (insert_space + compound) + ) + trail_pats.append( + pynini.closure(NEMO_DIGIT, total, total) @ (lead_fst + trail_part) + ) + graph_round = pynini.union(*round_pats) + graph_trail = pynini.union(*trail_pats) if trail_pats else None + return graph_round, graph_trail + + def _build_magnitude_pattern( + self, + one_label, plural_suffix, magnitude_zeros, + trailing_pair, + connector_e, insert_space, + graph_digit, graph_tens, graph_hundreds, + ): + """Restrict length; round + optional non-zero trailing.""" + restrict = (NEMO_DIGIT - "0") + pynini.closure( + NEMO_DIGIT, magnitude_zeros, magnitude_zeros + 2 + ) + graph_round, graph_trail = self._magnitude_graph( + one_label, plural_suffix, magnitude_zeros, + graph_digit, graph_tens, graph_hundreds, + connector_e, insert_space, trailing_pair, + ) + if graph_trail is None: + return pynutil.add_weight(restrict @ graph_round, -1.0) + non_zero_trail = pynini.union( + *[ + NEMO_DIGIT**n + (NEMO_DIGIT**magnitude_zeros - pynini.accep("0" * magnitude_zeros)) + for n in (1, 2, 3) + ] + ) + return pynutil.add_weight(restrict @ (graph_round | (non_zero_trail @ graph_trail)), -1.0) diff --git a/nemo_text_processing/text_normalization/pt/taggers/decimal.py b/nemo_text_processing/text_normalization/pt/taggers/decimal.py new file mode 100644 index 000000000..440c451c3 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/taggers/decimal.py @@ -0,0 +1,147 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import ( + NEMO_DIGIT, + GraphFst, + insert_space, + delete_space +) +from nemo_text_processing.text_normalization.pt.utils import get_abs_path + + +class DecimalFst(GraphFst): + """ + Finite state transducer for classifying Portuguese decimal numbers, e.g. + "1,26" -> decimal { integer_part: "um" fractional_part: "vinte e seis" } + "0,01" -> decimal { integer_part: "zero" fractional_part: "um" } (leading zeros stripped) + "1,001" -> decimal { integer_part: "um" fractional_part: "mil e um" } (data: decimal_fractional_specials) + "-1,26" -> decimal { negative: "true" ... } + "1,33 milhões" / "1 milhão" -> decimal { ... quantity: "milhões" / "milhão" } + + Args: + cardinal: CardinalFst instance for integer verbalization in tags. + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="decimal", kind="classify", deterministic=deterministic) + cardinal_graph = cardinal.graph + _num = lambda name: pynini.string_file( + get_abs_path(f"data/numbers/{name}") + ).optimize() + + comma = pynutil.delete(",") + quantity_words = _num("quantity_words.tsv") + digit = _num("digit.tsv") + zero = _num("zero.tsv") + fractional_specials = _num("decimal_fractional_specials.tsv") + graph_digit_or_zero = pynini.union(digit, zero) + digit_by_digit = ( + graph_digit_or_zero + + pynini.closure(insert_space + graph_digit_or_zero) + ).optimize() + + # Fractional: strip leading zeros → rest @ cardinal; all zeros → "zero" + delete_leading_zero = pynini.cross("0", "") + rest = pynini.difference(NEMO_DIGIT, pynini.accep("0")) + pynini.closure( + NEMO_DIGIT, 0 + ) + with_rest = (pynini.closure(delete_leading_zero, 0) + rest) @ ( + pynini.closure(NEMO_DIGIT, 1) @ cardinal_graph + ) + only_zeros = pynini.closure(delete_leading_zero, 1) + pynini.cross( + "0", "zero" + ) + fractional_strip = pynini.union(with_rest, only_zeros).optimize() + # Prefer specials (001→mil e um, 010→mil e dez, 100→mil e cem) over strip when both match + fractional_with_specials = pynini.union( + pynutil.add_weight(fractional_specials, -0.01), + fractional_strip, + ).optimize() + + fractional_short = pynini.closure(NEMO_DIGIT, 1, 9) + fractional_long = pynini.closure(NEMO_DIGIT, 10, 15) + non_zero_lead = pynini.difference(NEMO_DIGIT, pynini.accep("0")) + + # Integer "0" → fractional strip only (no specials) + graph_integer_zero = ( + pynutil.insert('integer_part: "') + + pynini.cross("0", "zero") + + pynutil.insert('"') + + insert_space + ) + graph_fractional_zero = ( + pynutil.insert('fractional_part: "') + + pynini.union( + fractional_short @ fractional_strip, + fractional_long @ digit_by_digit, + ) + + pynutil.insert('"') + ) + decimal_when_zero = ( + graph_integer_zero + comma + insert_space + graph_fractional_zero + ) + + # Integer non-zero → fractional: specials | strip + cardinal | digit-by-digit + graph_integer_pos = ( + pynutil.insert('integer_part: "') + + (non_zero_lead + pynini.closure(NEMO_DIGIT, 0, 11)) @ cardinal_graph + + pynutil.insert('"') + + insert_space + ) + graph_fractional_pos = ( + pynutil.insert('fractional_part: "') + + pynini.union( + fractional_short @ fractional_with_specials, + fractional_long @ digit_by_digit, + ) + + pynutil.insert('"') + ) + decimal_when_pos = ( + graph_integer_pos + comma + insert_space + graph_fractional_pos + ) + + decimal_core = pynini.union(decimal_when_zero, decimal_when_pos) + integer_quantity = ( + pynutil.insert('integer_part: "') + + (pynini.closure(NEMO_DIGIT, 1, 12) @ cardinal_graph) + + pynutil.insert('"') + + insert_space + + delete_space + + pynutil.insert('quantity: "') + + quantity_words + + pynutil.insert('"') + ) + decimal_quantity = ( + decimal_core + + delete_space + + pynutil.insert('quantity: "') + + quantity_words + + pynutil.insert('"') + ) + final_graph_wo_sign = pynini.union( + decimal_core, integer_quantity, decimal_quantity + ) + optional_minus = pynini.closure( + pynutil.insert("negative: ") + pynini.cross("-", '"true" '), 0, 1 + ) + final_graph = optional_minus + final_graph_wo_sign + + self.fst = self.add_tokens(final_graph).optimize() diff --git a/nemo_text_processing/text_normalization/pt/taggers/fraction.py b/nemo_text_processing/text_normalization/pt/taggers/fraction.py new file mode 100644 index 000000000..4bfe84087 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/taggers/fraction.py @@ -0,0 +1,146 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import ( + NEMO_DIGIT, + NEMO_SIGMA, + NEMO_WHITE_SPACE, + insert_space, + GraphFst, +) +from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels + + +class FractionFst(GraphFst): + """ + Finite state transducer for classifying Portuguese fraction numbers, e.g. + "1/2" -> fraction { numerator: "um" denominator: "meio" morphosyntactic_features: "ordinal" } + "2 3/4" -> fraction { integer_part: "dois" numerator: "três" denominator: "quarto" ... } + "2/11" -> fraction { numerator: "dois" denominator: "onze" morphosyntactic_features: "avos" } + + Args: + cardinal: CardinalFst instance for number parts. + ordinal: OrdinalFst instance for denominator 2-10 and exceptions. + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + """ + + def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = True): + super().__init__(name="fraction", kind="classify", deterministic=deterministic) + cardinal_graph = cardinal.graph + + # Denominators 2–10 use ordinal form (no data file: fixed set) + two_to_ten = pynini.union( + *[pynini.accep(str(d)) for d in range(2, 11)] + ).optimize() + + ord_digit_rows = load_labels(get_abs_path("data/ordinals/digit.tsv")) + ordinal_digit = pynini.string_map( + [(r[1], r[0]) for r in ord_digit_rows if len(r) >= 2] + ).optimize() + + ord_exc_rows = load_labels(get_abs_path("data/fractions/ordinal_exceptions.tsv")) + ordinal_exceptions = pynini.string_map( + [(r[0], r[1]) for r in ord_exc_rows if len(r) >= 2] + ).optimize() + + ord_hundreds_rows = load_labels(get_abs_path("data/ordinals/hundreds.tsv")) + ordinal_hundreds = pynini.string_map( + [(r[1], r[0]) for r in ord_hundreds_rows if len(r) >= 2] + ).optimize() + + powers_rows = load_labels(get_abs_path("data/fractions/powers_of_ten.tsv")) + powers_of_ten = pynini.string_map( + [(r[0], r[1]) for r in powers_rows if len(r) >= 2] + ).optimize() + + denom_ordinal_form = two_to_ten @ cardinal_graph @ ordinal_digit + denom_ordinal_form = denom_ordinal_form @ pynini.cdrewrite( + ordinal_exceptions, "", "", NEMO_SIGMA + ) + denom_ordinal = ( + pynutil.insert('denominator: "') + + denom_ordinal_form + + pynutil.insert('" morphosyntactic_features: "ordinal"') + ) + + denom_100 = ( + pynutil.insert('denominator: "') + + (pynini.accep("100") @ cardinal_graph @ ordinal_hundreds) + + pynutil.insert('" morphosyntactic_features: "ordinal"') + ) + denom_1000 = ( + pynutil.insert('denominator: "') + + (pynini.accep("1000") @ cardinal_graph @ powers_of_ten) + + pynutil.insert('" morphosyntactic_features: "ordinal"') + ) + + denom_ordinal_2_10_100_1000 = pynini.union( + denom_ordinal, denom_100, denom_1000 + ) + digit_plus = pynini.closure(NEMO_DIGIT, 1) + denom_avos_input = pynini.difference( + digit_plus, + pynini.union( + two_to_ten, + pynini.accep("100"), + pynini.accep("1000"), + ), + ) + denom_avos = ( + pynutil.insert('denominator: "') + + (denom_avos_input @ cardinal_graph) + + pynutil.insert('" morphosyntactic_features: "avos"') + ) + + denominator = pynini.union(denom_ordinal_2_10_100_1000, denom_avos) + + # Slash variants: ASCII /, Unicode ⁄ (U+2044), ∕ (U+2215); with or without spaces + slash_or_space_slash = pynini.union( + pynini.cross("/", '" '), + pynini.cross(" / ", '" '), + pynini.cross("\u2044", '" '), # fraction slash ⁄ + pynini.cross(" \u2044 ", '" '), + pynini.cross("\u2215", '" '), # division slash ∕ + pynini.cross(" \u2215 ", '" '), + ) + numerator = ( + pynutil.insert('numerator: "') + + cardinal_graph + + slash_or_space_slash + ) + fraction_core = numerator + denominator + + integer_part = ( + pynutil.insert('integer_part: "') + + cardinal_graph + + pynutil.insert('"') + + insert_space + ) + + optional_minus = pynini.closure( + pynutil.insert("negative: ") + pynini.cross("-", '"true" '), 0, 1 + ) + + mixed = ( + integer_part + + pynini.closure(NEMO_WHITE_SPACE, 1) + + fraction_core + ) + graph = optional_minus + pynini.union(mixed, fraction_core) + + self.fst = self.add_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/pt/taggers/ordinal.py b/nemo_text_processing/text_normalization/pt/taggers/ordinal.py new file mode 100644 index 000000000..7f5a62d9d --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/taggers/ordinal.py @@ -0,0 +1,93 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import ( + NEMO_DIGIT, + GraphFst, +) +from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels + + +class OrdinalFst(GraphFst): + """ + Finite state transducer for classifying Portuguese ordinals, e.g. + "1º" / "1ª" -> ordinal { integer: "primeiro" / "primeira" morphosyntactic_features: "gender_masc" / "gender_fem" } + "21º" -> ordinal { integer: "vigésimo primeiro" morphosyntactic_features: "gender_masc" } + + Args: + cardinal: CardinalFst instance for composing compound ordinals. + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="ordinal", kind="classify", deterministic=deterministic) + cardinal_graph = cardinal.graph + + spec_rows = load_labels(get_abs_path("data/ordinals/specials.tsv")) + spec = {r[0]: r[1] for r in spec_rows if len(r) >= 2} + conn_in = spec.get("connector_in", " e ") + conn_out = spec.get("connector_out", " ") + conn = pynini.cross(conn_in, conn_out) + + # Data: ordinal \t cardinal → FST cardinal→ordinal via load_labels + digit_rows = load_labels(get_abs_path("data/ordinals/digit.tsv")) + graph_digit = pynini.string_map([(r[1], r[0]) for r in digit_rows if len(r) >= 2]).optimize() + teen_rows = load_labels(get_abs_path("data/ordinals/teen.tsv")) + graph_teens = pynini.string_map([(r[1], r[0]) for r in teen_rows if len(r) >= 2]).optimize() + ties_rows = load_labels(get_abs_path("data/ordinals/ties.tsv")) + graph_ties = pynini.string_map([(r[1], r[0]) for r in ties_rows if len(r) >= 2]).optimize() + hundreds_rows = load_labels(get_abs_path("data/ordinals/hundreds.tsv")) + graph_hundreds = pynini.string_map([(r[1], r[0]) for r in hundreds_rows if len(r) >= 2]).optimize() + + graph_tens = pynini.union( + graph_teens, + graph_ties + pynini.closure(conn + graph_digit, 0, 1), + ) + graph_hundred_component = pynini.union( + graph_hundreds + + pynini.closure( + conn + pynini.union(graph_tens, graph_digit), 0, 1 + ), + graph_tens, + graph_digit, + ) + ordinal_rewrite = graph_hundred_component.optimize() + ordinal_inner = cardinal_graph @ ordinal_rewrite + + opt_dot = pynini.closure(pynutil.delete("."), 0, 1) + suffix_masc = opt_dot + pynutil.delete(pynini.union("º", "°")) + suffix_fem = opt_dot + pynutil.delete("ª") + digit_block = pynini.closure(NEMO_DIGIT, 1, 3) + + to_ordinal_masc = (digit_block + suffix_masc) @ ordinal_inner + to_ordinal_fem = (digit_block + suffix_fem) @ ordinal_inner + + graph_masc = ( + pynutil.insert('integer: "') + + to_ordinal_masc + + pynutil.insert('" morphosyntactic_features: "gender_masc"') + ) + graph_fem = ( + pynutil.insert('integer: "') + + to_ordinal_fem + + pynutil.insert('" morphosyntactic_features: "gender_fem"') + ) + self.fst = self.add_tokens( + pynini.union(graph_masc, graph_fem) + ).optimize() diff --git a/nemo_text_processing/text_normalization/pt/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/pt/taggers/tokenize_and_classify.py new file mode 100644 index 000000000..686fb8f6b --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/taggers/tokenize_and_classify.py @@ -0,0 +1,115 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import ( + NEMO_WHITE_SPACE, + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) +from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst +from nemo_text_processing.text_normalization.pt.taggers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.pt.taggers.decimal import DecimalFst +from nemo_text_processing.text_normalization.pt.taggers.fraction import FractionFst +from nemo_text_processing.text_normalization.pt.taggers.ordinal import OrdinalFst +from nemo_text_processing.text_normalization.en.taggers.whitelist import WhiteListFst +from nemo_text_processing.text_normalization.en.taggers.word import WordFst +from nemo_text_processing.utils.logging import logger + + +class ClassifyFst(GraphFst): + """ + Final class that composes all Portuguese classification grammars. This class can process an entire sentence (lower cased). + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + + Args: + input_case: accepting either "lower_cased" or "cased" input. + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. + overwrite_cache: set to True to overwrite .far files. + whitelist: path to a file with whitelist replacements. + """ + + def __init__( + self, + input_case: str, + deterministic: bool = False, + cache_dir: str = None, + overwrite_cache: bool = False, + whitelist: str = None, + ): + super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic) + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + whitelist_file = os.path.basename(whitelist) if whitelist else "" + far_file = os.path.join( + cache_dir, + f"_{input_case}_pt_tn_{deterministic}_deterministic{whitelist_file}.far", + ) + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] + logger.info(f"ClassifyFst.fst was restored from {far_file}.") + else: + logger.info(f"Creating ClassifyFst grammars. This might take some time...") + + # Initialize Portuguese taggers + cardinal = CardinalFst(deterministic=deterministic) + ordinal = OrdinalFst(cardinal, deterministic=deterministic) + fraction = FractionFst(cardinal, ordinal, deterministic=deterministic) + decimal = DecimalFst(cardinal, deterministic=deterministic) + + punctuation = PunctuationFst(deterministic=deterministic) + word_graph = WordFst(punctuation=punctuation, deterministic=deterministic).fst + whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist) + + classify = ( + pynutil.add_weight(whitelist.fst, 1.01) + | pynutil.add_weight(fraction.fst, 1.1) + | pynutil.add_weight(decimal.fst, 1.1) + | pynutil.add_weight(ordinal.fst, 1.1) + | pynutil.add_weight(cardinal.fst, 1.1) + | pynutil.add_weight(word_graph, 100) + ) + + # Wrap tokens properly + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") + punct_graph = pynutil.insert("tokens { ") + pynutil.add_weight(punctuation.fst, weight=2.1) + pynutil.insert(" }") + + # Simple graph structure + graph = token + pynini.closure( + pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) + token + ) + + # Allow punctuation + graph |= punct_graph + + self.fst = delete_space + graph + delete_space + + if far_file: + generator_main(far_file, {"tokenize_and_classify": self.fst}) + logger.info(f"ClassifyFst grammars are saved to {far_file}.") + + +if __name__ == "__main__": + ClassifyFst(input_case="cased", deterministic=False) \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/pt/utils.py b/nemo_text_processing/text_normalization/pt/utils.py new file mode 100644 index 000000000..e6e0c51fd --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/utils.py @@ -0,0 +1,50 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use it except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Portuguese (PT) text normalization utilities. + +Provides get_abs_path for resolving data paths and load_labels for reading TSV label files. +""" +import csv +import os + + +def get_abs_path(rel_path: str) -> str: + """ + Resolve a path relative to this module to an absolute path. + + Args: + rel_path: path relative to the PT text normalization data directory. + + Returns: + Absolute path string. + """ + return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path + + +def load_labels(abs_path: str): + """ + Load a TSV file as a list of rows (list of lists). + + Args: + abs_path: absolute path to a UTF-8 TSV file. + + Returns: + List of rows, each row a list of fields (e.g. from csv.reader). + """ + with open(abs_path, encoding="utf-8") as label_tsv: + labels = list(csv.reader(label_tsv, delimiter="\t")) + return labels + diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/__init__.py b/nemo_text_processing/text_normalization/pt/verbalizers/__init__.py new file mode 100644 index 000000000..9e3fb699d --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/verbalizers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/pt/verbalizers/cardinal.py new file mode 100644 index 000000000..b05d2c8ca --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/verbalizers/cardinal.py @@ -0,0 +1,67 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import ( + NEMO_NOT_QUOTE, + GraphFst, + shift_cardinal_gender_pt, +) + + +class CardinalFst(GraphFst): + """ + Finite state transducer for verbalizing Portuguese cardinal numbers, e.g. + cardinal { integer: "dois" } -> dois + cardinal { integer: "dois" } -> duas (feminine context via shift_cardinal_gender_pt) + cardinal { negative: "true" integer: "cinco" } -> menos cinco + + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic) + optional_sign = pynini.closure(pynini.cross("negative: \"true\" ", "menos "), 0, 1) + self.optional_sign = optional_sign + + integer = pynini.closure(NEMO_NOT_QUOTE, 1) + self.integer = pynutil.delete(" \"") + integer + pynutil.delete("\"") + + integer = pynutil.delete("integer:") + self.integer + + # Generate masculine form (default) + graph_masc = optional_sign + integer + + # Generate feminine form using Portuguese gender conversion + graph_fem = shift_cardinal_gender_pt(graph_masc) + + self.graph_masc = pynini.optimize(graph_masc) + self.graph_fem = pynini.optimize(graph_fem) + + # Default to masculine for standalone numbers + # Context-aware gender selection will be handled by higher-level components + graph = graph_masc + + if not deterministic: + # For alternate renderings and contractions + # Portuguese doesn't have apocope like Spanish, but may have contractions + pass + + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/decimal.py b/nemo_text_processing/text_normalization/pt/verbalizers/decimal.py new file mode 100644 index 000000000..cad0656e6 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/verbalizers/decimal.py @@ -0,0 +1,80 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use it except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import ( + NEMO_NOT_QUOTE, + GraphFst, + delete_space, + insert_space, +) +from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels + + +class DecimalFst(GraphFst): + """ + Finite state transducer for verbalizing Portuguese decimal numbers, e.g. + decimal { integer_part: "um" fractional_part: "vinte e seis" } -> um vírgula vinte e seis + decimal { negative: "true" integer_part: "um" ... } -> menos um vírgula ... + decimal { integer_part: "um" quantity: "milhão" } -> um milhão + + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="decimal", kind="verbalize", deterministic=deterministic) + labels = load_labels(get_abs_path("data/numbers/decimal_specials.tsv")) + spec = {r[0]: r[1] for r in labels if len(r) >= 2} + sep = spec.get("separator", "vírgula") + minus = spec.get("minus", "menos") + + optional_sign = pynini.closure( + pynini.cross('negative: "true" ', minus + " "), 0, 1 + ) + + integer = ( + pynutil.delete('integer_part: "') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + fractional = ( + pynutil.delete('fractional_part: "') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + quantity = ( + delete_space + + insert_space + + pynutil.delete('quantity: "') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + + integer_quantity = integer + quantity + decimal_part = ( + integer + + delete_space + + insert_space + + pynutil.insert(sep + " ") + + fractional + + pynini.closure(quantity, 0, 1) + ) + + graph = optional_sign + pynini.union(integer_quantity, decimal_part) + + self.fst = self.delete_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/fraction.py b/nemo_text_processing/text_normalization/pt/verbalizers/fraction.py new file mode 100644 index 000000000..a9ac5b2bd --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/verbalizers/fraction.py @@ -0,0 +1,118 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use it except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import ( + NEMO_NOT_QUOTE, + GraphFst, + insert_space, +) +from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels + + +class FractionFst(GraphFst): + """ + Finite state transducer for verbalizing Portuguese fraction numbers, e.g. + fraction { numerator: "um" denominator: "meio" morphosyntactic_features: "ordinal" } -> um meio + fraction { integer_part: "dois" numerator: "três" denominator: "quarto" } -> dois e três quartos + fraction { numerator: "dois" denominator: "onze" morphosyntactic_features: "avos" } -> dois onze avos + + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="fraction", kind="verbalize", deterministic=deterministic) + labels = load_labels(get_abs_path("data/fractions/specials.tsv")) + spec = {r[0]: r[1] for r in labels if len(r) >= 2} + connector = pynutil.insert(spec.get("connector", " e ")) + minus = spec.get("minus", "menos ") + plural_suffix = spec.get("plural_suffix", "s") + avos_suffix = spec.get("avos_suffix", " avos") + numerator_one_val = spec.get("numerator_one", "um") + denominator_half_val = spec.get("denominator_half", "meio") + + optional_sign = pynini.closure( + pynini.cross('negative: "true" ', minus) + insert_space, 0, 1 + ) + + integer = ( + pynutil.delete('integer_part: "') + + pynini.closure(NEMO_NOT_QUOTE) + + pynutil.delete('" ') + ) + + numerator_one = ( + pynutil.delete('numerator: "') + + pynini.accep(numerator_one_val) + + pynutil.delete('" ') + ) + numerator_rest = ( + pynutil.delete('numerator: "') + + pynini.difference( + pynini.closure(NEMO_NOT_QUOTE), pynini.accep(numerator_one_val) + ) + + pynutil.delete('" ') + ) + + denom_ordinal = ( + pynutil.delete('denominator: "') + + pynini.closure(NEMO_NOT_QUOTE) + + pynutil.delete('" morphosyntactic_features: "ordinal"') + ) + denom_meio = ( + pynutil.delete('denominator: "') + + pynini.accep(denominator_half_val) + + pynutil.delete('" morphosyntactic_features: "ordinal"') + ) + denom_avos = ( + pynutil.delete('denominator: "') + + pynini.closure(NEMO_NOT_QUOTE) + + pynutil.delete('" morphosyntactic_features: "avos"') + ) + + fraction_ordinal_singular = numerator_one + insert_space + denom_ordinal + fraction_ordinal_plural = ( + numerator_rest + insert_space + denom_ordinal + pynutil.insert(plural_suffix) + ) + fraction_ordinal = pynini.union( + fraction_ordinal_singular, fraction_ordinal_plural + ) + + fraction_avos = ( + pynini.union(numerator_one, numerator_rest) + + insert_space + + denom_avos + + pynutil.insert(avos_suffix) + ) + + fraction = pynini.union(fraction_ordinal, fraction_avos) + mixed_um_meio = ( + integer + + connector + + pynutil.delete('numerator: "' + numerator_one_val + '" " ') + + denom_meio + ) + optional_integer = pynini.closure( + integer + connector + insert_space, 0, 1 + ) + graph = optional_sign + pynini.union( + pynutil.add_weight(mixed_um_meio, -0.01), + optional_integer + fraction, + ) + + self.fst = self.delete_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/pt/verbalizers/ordinal.py new file mode 100644 index 000000000..c495d0328 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/verbalizers/ordinal.py @@ -0,0 +1,66 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use it except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import ( + NEMO_NOT_QUOTE, + NEMO_SIGMA, + NEMO_SPACE, + GraphFst, +) +from nemo_text_processing.text_normalization.pt.utils import get_abs_path + + +class OrdinalFst(GraphFst): + """ + Finite state transducer for verbalizing Portuguese ordinals, e.g. + ordinal { integer: "primeiro" morphosyntactic_features: "gender_masc" } -> primeiro + ordinal { integer: "primeira" morphosyntactic_features: "gender_fem" } -> primeira (feminine rewrite applied) + + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic) + integer = ( + pynutil.delete('integer: "') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + + fem_rewrite = pynini.string_file( + get_abs_path("data/ordinals/feminine.tsv") + ) + feminine_rewrite = pynini.cdrewrite( + fem_rewrite, + "", + pynini.union(NEMO_SPACE, pynini.accep("[EOS]")), + NEMO_SIGMA, + ) + + graph_masc = ( + integer + + pynutil.delete(' morphosyntactic_features: "gender_masc"') + ) + graph_fem = ( + (integer @ feminine_rewrite) + + pynutil.delete(' morphosyntactic_features: "gender_fem"') + ) + self.fst = self.delete_tokens( + pynini.union(graph_masc, graph_fem) + ).optimize() diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py new file mode 100644 index 000000000..4fa03465b --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py @@ -0,0 +1,43 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use it except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from nemo_text_processing.text_normalization.pt.graph_utils import GraphFst +from nemo_text_processing.text_normalization.pt.verbalizers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.pt.verbalizers.decimal import DecimalFst +from nemo_text_processing.text_normalization.pt.verbalizers.fraction import FractionFst +from nemo_text_processing.text_normalization.pt.verbalizers.ordinal import OrdinalFst + + +class VerbalizeFst(GraphFst): + """ + Composes Portuguese verbalizer grammars (cardinal, ordinal, fraction, decimal). + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="verbalize", kind="verbalize", deterministic=deterministic) + + cardinal = CardinalFst(deterministic=deterministic) + ordinal = OrdinalFst(deterministic=deterministic) + fraction = FractionFst(deterministic=deterministic) + decimal = DecimalFst(deterministic=deterministic) + graph = fraction.fst | decimal.fst | ordinal.fst | cardinal.fst + + self.fst = graph \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/pt/verbalizers/verbalize_final.py new file mode 100644 index 000000000..5b6488bd1 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/verbalizers/verbalize_final.py @@ -0,0 +1,71 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use it except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import ( + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) +from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst +from nemo_text_processing.text_normalization.pt.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.utils.logging import logger + + +class VerbalizeFinalFst(GraphFst): + """ + Finite state transducer that verbalizes an entire Portuguese sentence, e.g. + tokens { cardinal { integer: "dois" } } tokens { name: "e" } tokens { cardinal { integer: "três" } } -> dois e três + + Args: + deterministic: if True will provide a single transduction option, + for False multiple options (used for audio-based normalization) + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. + overwrite_cache: set to True to overwrite .far files + """ + + def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): + super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) + + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, f"pt_tn_{deterministic}_deterministic_verbalizer.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["verbalize"] + logger.info(f'VerbalizeFinalFst graph was restored from {far_file}.') + else: + + verbalize = VerbalizeFst(deterministic=deterministic).fst + word = WordFst(deterministic=deterministic).fst + types = verbalize | word + graph = ( + pynutil.delete("tokens") + + delete_space + + pynutil.delete("{") + + delete_space + + types + + delete_space + + pynutil.delete("}") + ) + graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space + + self.fst = graph.optimize() + if far_file: + generator_main(far_file, {"verbalize": self.fst}) \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..d4c0b33fc --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,118 @@ +0~zero +1~um +2~dois +3~três +4~quatro +5~cinco +6~seis +7~sete +8~oito +9~nove +10~dez +11~onze +12~doze +13~treze +14~catorze +15~quinze +16~dezesseis +17~dezessete +18~dezoito +19~dezenove +20~vinte +21~vinte e um +22~vinte e dois +23~vinte e três +24~vinte e quatro +25~vinte e cinco +26~vinte e seis +27~vinte e sete +28~vinte e oito +29~vinte e nove +30~trinta +40~quarenta +50~cinquenta +60~sessenta +70~setenta +80~oitenta +90~noventa +100~cem +101~cento e um +102~cento e dois +110~cento e dez +120~cento e vinte +130~cento e trinta +200~duzentos +300~trezentos +400~quatrocentos +500~quinhentos +600~seiscentos +700~setecentos +800~oitocentos +900~novecentos +1000~mil +1 000~mil +1.000~mil +1010~mil e dez +1020~mil e vinte +1100~mil e cem +1110~mil cento e dez +1111~mil cento e onze +2000~dois mil +2002~dois mil e dois +2010~dois mil e dez +2020~dois mil e vinte +2100~dois mil e cem +2110~dois mil cento e dez +2111~dois mil cento e onze +10000~dez mil +10 000~dez mil +10.000~dez mil +100000~cem mil +100 000~cem mil +100.000~cem mil +1 000 000~um milhão +1.000.000~um milhão +1 034 068~um milhão trinta e quatro mil sessenta e oito +2.000.000~dois milhões +1.000.000.000~um bilhão +1000000000~um bilhão +2.000.000.000~dois bilhões +2000000000~dois bilhões +3 000 000 000 000~três trilhões +3.000.000.000.000~três trilhões +1001~mil e um +1010~mil e dez +1100~mil e cem +1101~mil cento e um +1111~mil cento e onze +1999~mil novecentos e noventa e nove +100000~cem mil +100001~cem mil e um +101000~cento e um mil +101001~cento e um mil e um +110000~cento e dez mil +111000~cento e onze mil +111111~cento e onze mil cento e onze +1001000~um milhão e mil +1001001~um milhão mil e um +1010000~um milhão e dez mil +1010101~um milhão dez mil cento e um +1100000~um milhão e cem mil +1110000~um milhão cento e dez mil +1001010101~um bilhão um milhão dez mil cento e um +1010101010~um bilhão dez milhões cento e um mil e dez +1234567890~um bilhão duzentos e trinta e quatro milhões quinhentos e sessenta e sete mil oitocentos e noventa +987654321~novecentos e oitenta e sete milhões seiscentos e cinquenta e quatro mil trezentos e vinte e um +999999999~novecentos e noventa e nove milhões novecentos e noventa e nove mil novecentos e noventa e nove +2000000001~dois bilhões e um +3000001000~três bilhões e mil +4000100000~quatro bilhões e cem mil +5000000100~cinco bilhões e cem +6001000000~seis bilhões e um milhão +1000000000000~um trilhão +1000000000001~um trilhão e um +1230000000000~um trilhão duzentos e trinta bilhões +3004005006007~três trilhões quatro bilhões cinco milhões seis mil e sete +1000001~um milhão e um +1001100~um milhão mil e cem +1001110~um milhão mil cento e dez \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_decimal.txt new file mode 100644 index 000000000..1f9b59c69 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_decimal.txt @@ -0,0 +1,58 @@ +0,1~zero vírgula um +0,2~zero vírgula dois +0,5~zero vírgula cinco +0,9~zero vírgula nove +0,01~zero vírgula um +0,02~zero vírgula dois +0,05~zero vírgula cinco +0,10~zero vírgula dez +0,11~zero vírgula onze +0,15~zero vírgula quinze +0,20~zero vírgula vinte +0,25~zero vírgula vinte e cinco +0,50~zero vírgula cinquenta +0,99~zero vírgula noventa e nove +1,1~um vírgula um +1,2~um vírgula dois +1,5~um vírgula cinco +1,10~um vírgula dez +1,15~um vírgula quinze +1,20~um vírgula vinte +1,26~um vírgula vinte e seis +1,33~um vírgula trinta e três +1,50~um vírgula cinquenta +3,141~três vírgula cento e quarenta e um +3,256~três vírgula duzentos e cinquenta e seis +3,999~três vírgula novecentos e noventa e nove +3,1415~três vírgula mil quatrocentos e quinze +3,14159~três vírgula quatorze mil cento e cinquenta e nove +3,1001~três vírgula mil e um +3,2003~três vírgula dois mil e três +3,014~três vírgula quatorze +3,0141~três vírgula cento e quarenta e um +3,1005~três vírgula mil e cinco +3,1050~três vírgula mil e cinquenta +3,1010~três vírgula mil e dez +-1,2~menos um vírgula dois +-1,26~menos um vírgula vinte e seis +-3,5~menos três vírgula cinco +-0,5~menos zero vírgula cinco +1,2 milhões~um vírgula dois milhões +1,5 milhões~um vírgula cinco milhões +1,25 milhões~um vírgula vinte e cinco milhões +2,5 bilhões~dois vírgula cinco bilhões +3,75 bilhões~três vírgula setenta e cinco bilhões +0,001~zero vírgula um +0,0001~zero vírgula um +1,001~um vírgula mil e um +1,010~um vírgula mil e dez +1,100~um vírgula mil e cem +10,01~dez vírgula um +10,001~dez vírgula mil e um +100,5~cem vírgula cinco +100,05~cem vírgula cinco +3,14~três vírgula quatorze +3,141~três vírgula cento e quarenta e um +3,1415~três vírgula mil quatrocentos e quinze +3,14159~três vírgula quatorze mil cento e cinquenta e nove +3,1415926535~três vírgula um quatro um cinco nove dois seis cinco três cinco \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_fraction.txt new file mode 100644 index 000000000..256fecd86 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_fraction.txt @@ -0,0 +1,21 @@ +1/2~um meio +1/3~um terço +1/4~um quarto +2/3~dois terços +3/4~três quartos +1/5~um quinto +2/5~dois quintos +1/6~um sexto +5/6~cinco sextos +1/8~um oitavo +3/8~três oitavos +7/8~sete oitavos +1/10~um décimo +3/10~três décimos +3/11~três onze avos +5/13~cinco treze avos +1/100~um centésimo +1/1000~um milésimo +1 1/2~um e um meio +2 1/4~dois e um quarto +3 2/3~três e dois terços \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_ordinal.txt new file mode 100644 index 000000000..f9a58a9ce --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_ordinal.txt @@ -0,0 +1,39 @@ +1º~primeiro +2º~segundo +3º~terceiro +4º~quarto +5º~quinto +6º~sexto +7º~sétimo +8º~oitavo +9º~nono +10º~décimo +11º~décimo primeiro +12º~décimo segundo +13º~décimo terceiro +20º~vigésimo +21º~vigésimo primeiro +22º~vigésimo segundo +23º~vigésimo terceiro +100º~centésimo +111º~centésimo décimo primeiro +134º~centésimo trigésimo quarto +1ª~primeira +2ª~segunda +3ª~terceira +4ª~quarta +5ª~quinta +6ª~sexta +7ª~sétima +8ª~oitava +9ª~nona +10ª~décima +11ª~décima primeira +12ª~décima segunda +13ª~décima terceira +20ª~vigésima +21ª~vigésima primeira +22ª~vigésima segunda +23ª~vigésima terceira +100ª~centésima +11ª casa~décima primeira casa \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/test_cardinal.py b/tests/nemo_text_processing/pt/test_cardinal.py index dafa3e358..5271626ce 100644 --- a/tests/nemo_text_processing/pt/test_cardinal.py +++ b/tests/nemo_text_processing/pt/test_cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -30,3 +31,11 @@ class TestCardinal: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + @parameterized.expand(parse_test_case_file('pt/data_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/test_decimal.py b/tests/nemo_text_processing/pt/test_decimal.py index afbec329b..bdba649d9 100644 --- a/tests/nemo_text_processing/pt/test_decimal.py +++ b/tests/nemo_text_processing/pt/test_decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -29,3 +30,11 @@ class TestDecimal: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + @parameterized.expand(parse_test_case_file('pt/data_text_normalization/test_cases_decimal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/test_fraction.py b/tests/nemo_text_processing/pt/test_fraction.py new file mode 100644 index 000000000..16e6c5f30 --- /dev/null +++ b/tests/nemo_text_processing/pt/test_fraction.py @@ -0,0 +1,32 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use it except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestFraction: + normalizer = Normalizer( + lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased', post_process=True + ) + + @parameterized.expand(parse_test_case_file('pt/data_text_normalization/test_cases_fraction.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=False) + assert pred == expected, f"input: {test_input}" diff --git a/tests/nemo_text_processing/pt/test_ordinal.py b/tests/nemo_text_processing/pt/test_ordinal.py index a830e2d21..06b5cd15d 100644 --- a/tests/nemo_text_processing/pt/test_ordinal.py +++ b/tests/nemo_text_processing/pt/test_ordinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer - +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -30,3 +30,11 @@ class TestOrdinal: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + @parameterized.expand(parse_test_case_file('pt/data_text_normalization/test_cases_ordinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected \ No newline at end of file From 9e4eecd649cbdd7fd62271d3dc83d96bfb36f358 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 13 Mar 2026 11:26:38 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/pt/__init__.py | 2 +- .../text_normalization/pt/graph_utils.py | 20 +-- .../text_normalization/pt/taggers/cardinal.py | 142 +++++++++--------- .../text_normalization/pt/taggers/decimal.py | 55 ++----- .../text_normalization/pt/taggers/fraction.py | 57 ++----- .../text_normalization/pt/taggers/ordinal.py | 18 +-- .../pt/taggers/tokenize_and_classify.py | 16 +- .../text_normalization/pt/utils.py | 1 - .../pt/verbalizers/cardinal.py | 10 +- .../pt/verbalizers/decimal.py | 23 +-- .../pt/verbalizers/fraction.py | 50 ++---- .../pt/verbalizers/ordinal.py | 31 +--- .../pt/verbalizers/verbalize.py | 2 +- .../pt/verbalizers/verbalize_final.py | 4 +- .../nemo_text_processing/pt/test_cardinal.py | 3 +- tests/nemo_text_processing/pt/test_decimal.py | 3 +- tests/nemo_text_processing/pt/test_ordinal.py | 3 +- 17 files changed, 152 insertions(+), 288 deletions(-) diff --git a/nemo_text_processing/text_normalization/pt/__init__.py b/nemo_text_processing/text_normalization/pt/__init__.py index 212acc24c..ffd13e2d6 100644 --- a/nemo_text_processing/text_normalization/pt/__init__.py +++ b/nemo_text_processing/text_normalization/pt/__init__.py @@ -10,4 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/pt/graph_utils.py b/nemo_text_processing/text_normalization/pt/graph_utils.py index 0a2c29c0b..d2e6c9ec7 100644 --- a/nemo_text_processing/text_normalization/pt/graph_utils.py +++ b/nemo_text_processing/text_normalization/pt/graph_utils.py @@ -47,9 +47,7 @@ delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) insert_space = pynutil.insert(" ") -delete_extra_space = pynini.cross( - pynini.closure(NEMO_WHITE_SPACE, 1), " " -).optimize() +delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ").optimize() def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]) -> None: @@ -84,13 +82,9 @@ def __init__(self, name: str, kind: str, deterministic: bool = True): self._fst = None self.deterministic = deterministic - self.far_path = Path( - os.path.dirname(os.path.abspath(__file__)) + "/grammars/" + kind + "/" + name + ".far" - ) + self.far_path = Path(os.path.dirname(os.path.abspath(__file__)) + "/grammars/" + kind + "/" + name + ".far") if self.far_exist(): - self._fst = Far( - self.far_path, mode="r", arc_type="standard", far_type="default" - ).get_fst() + self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() def far_exist(self) -> bool: return self.far_path.exists() @@ -116,9 +110,7 @@ def delete_tokens(self, fst) -> "pynini.FstLike": + delete_space + pynutil.delete("}") ) - return res @ pynini.cdrewrite( - pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA - ) + return res @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA) # ---- PT-specific (Brazilian: 1.000.000 or 1 000 000) ---- @@ -172,9 +164,7 @@ def shift_cardinal_gender_pt(fst: "pynini.FstLike") -> "pynini.FstLike": ) fem_hundreds = pynini.cdrewrite( pynini.cross("entos", "entas"), - pynini.union( - "duz", "trez", "quatroc", "quinh", "seisc", "setec", "oitoc", "novec" - ), + pynini.union("duz", "trez", "quatroc", "quinh", "seisc", "setec", "oitoc", "novec"), pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep('"')), NEMO_SIGMA, ) diff --git a/nemo_text_processing/text_normalization/pt/taggers/cardinal.py b/nemo_text_processing/text_normalization/pt/taggers/cardinal.py index 15969e6b4..a56057852 100644 --- a/nemo_text_processing/text_normalization/pt/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/pt/taggers/cardinal.py @@ -26,8 +26,8 @@ NEMO_WHITE_SPACE, GraphFst, delete_space, - insert_space, filter_cardinal_punctuation, + insert_space, ) from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels @@ -48,9 +48,7 @@ def __init__(self, deterministic: bool = True): super().__init__(name="cardinal", kind="classify", deterministic=deterministic) specials = { - row[0]: row[1] - for row in load_labels(get_abs_path("data/numbers/cardinal_specials.tsv")) - if len(row) >= 2 + row[0]: row[1] for row in load_labels(get_abs_path("data/numbers/cardinal_specials.tsv")) if len(row) >= 2 } connector_e = insert_space + pynutil.insert(specials["connector"]) + insert_space thousand = specials["thousand"] @@ -58,23 +56,21 @@ def __init__(self, deterministic: bool = True): hundred_1 = specials["hundred_1"] scale_rows = load_labels(get_abs_path("data/numbers/scales.tsv")) - scales = [ - (row[0], row[1], int(row[2])) - for row in scale_rows - if len(row) >= 3 and row[2].strip().isdigit() - ] + scales = [(row[0], row[1], int(row[2])) for row in scale_rows if len(row) >= 3 and row[2].strip().isdigit()] _num = lambda p: pynini.string_file(get_abs_path(f"data/numbers/{p}")) zero, digit, teens, tens, hundreds = ( - _num("zero.tsv"), _num("digit.tsv"), _num("teens.tsv"), _num("tens.tsv"), _num("hundreds.tsv") + _num("zero.tsv"), + _num("digit.tsv"), + _num("teens.tsv"), + _num("tens.tsv"), + _num("hundreds.tsv"), ) digits_no_one = (NEMO_DIGIT - "1") @ digit graph_tens = teens | (tens + (pynutil.delete("0") | (connector_e + digit))) self.tens = graph_tens.optimize() - self.two_digit_non_zero = pynini.union( - digit, graph_tens, (pynini.cross("0", NEMO_SPACE) + digit) - ).optimize() + self.two_digit_non_zero = pynini.union(digit, graph_tens, (pynini.cross("0", NEMO_SPACE) + digit)).optimize() graph_hundreds = hundreds + pynini.union( pynutil.delete("00"), @@ -109,7 +105,8 @@ def __init__(self, deterministic: bool = True): (connector_e + graph_tens), (connector_e + pynutil.delete("0") + digit), ), - hundreds + pynini.union( + hundreds + + pynini.union( (connector_e + graph_tens), (connector_e + digit), ), @@ -129,7 +126,9 @@ def __init__(self, deterministic: bool = True): ) t_comp_no_one = pynini.union( pynutil.delete("000") + h_comp_no_one, - h_comp_no_one + insert_space + pynutil.insert(thousand) + h_comp_no_one + + insert_space + + pynutil.insert(thousand) + ((insert_space + h_comp) | pynutil.delete("000")), pynini.cross("001", thousand) + ((insert_space + h_comp) | pynutil.delete("000")), ) @@ -154,8 +153,10 @@ def __init__(self, deterministic: bool = True): # Units 6 (u6): pure get "e" after scale; compound no "e" u6_one = pynini.cross("000001", "1") @ digit u6_pure = pynini.union( - u6_one, pynini.cross("001000", thousand), - pynini.cross("000010", "10") @ graph_tens, pynini.cross("000100", hundred_100), + u6_one, + pynini.cross("001000", thousand), + pynini.cross("000010", "10") @ graph_tens, + pynini.cross("000100", hundred_100), (pynini.cross("010000", "10") @ graph_tens) + insert_space + pynutil.insert(thousand), pynini.cross("100000", hundred_100) + insert_space + pynutil.insert(thousand), ) @@ -164,15 +165,22 @@ def __init__(self, deterministic: bool = True): z18 = pynini.accep("0" * 18) # 18 zeros: branch no "e" smaller_e = (connector_e + u6_pure) | u6_compound | pynutil.delete("0" * 6) smaller = u6 | pynutil.delete("0" * 6) - graph_24 = ( - ((NEMO_DIGIT**18 - z18) + NEMO_DIGIT**6) @ (graph_large_scales + smaller_e) - ) | ((z18 + NEMO_DIGIT**6) @ (pynutil.delete(z18) + smaller)) + graph_24 = (((NEMO_DIGIT**18 - z18) + NEMO_DIGIT**6) @ (graph_large_scales + smaller_e)) | ( + (z18 + NEMO_DIGIT**6) @ (pynutil.delete(z18) + smaller) + ) trail_by_z = {9: trail_9, 12: trail_12} magnitude_patterns = [ self._build_magnitude_pattern( - one_label, plural_suffix, magnitude_zeros, trail_by_z.get(magnitude_zeros), - connector_e, insert_space, digit, graph_tens, graph_hundreds, + one_label, + plural_suffix, + magnitude_zeros, + trail_by_z.get(magnitude_zeros), + connector_e, + insert_space, + digit, + graph_tens, + graph_hundreds, ) for one_label, plural_suffix, magnitude_zeros in scales if magnitude_zeros > 0 @@ -180,20 +188,17 @@ def __init__(self, deterministic: bool = True): pad = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 0) pad = pad @ pynini.cdrewrite(pynini.closure(pynutil.insert("0")), "[BOS]", "", NEMO_SIGMA) @ NEMO_DIGIT**24 - norm = pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite(delete_space, "", "[EOS]", NEMO_SIGMA) - norm = norm @ pynini.cdrewrite(pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 2), NEMO_SPACE), NEMO_ALPHA, NEMO_ALPHA, NEMO_SIGMA) + norm = pynini.cdrewrite(delete_space, "[BOS]", "", NEMO_SIGMA) @ pynini.cdrewrite( + delete_space, "", "[EOS]", NEMO_SIGMA + ) + norm = norm @ pynini.cdrewrite( + pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 2), NEMO_SPACE), NEMO_ALPHA, NEMO_ALPHA, NEMO_SIGMA + ) self.graph = reduce(lambda a, b: a | b, magnitude_patterns, pad @ graph_24 @ norm) | zero self.graph = filter_cardinal_punctuation(self.graph).optimize() - optional_minus_graph = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1 - ) - final_graph = ( - optional_minus_graph - + pynutil.insert("integer: \"") - + self.graph - + pynutil.insert("\"") - ) + optional_minus_graph = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) + final_graph = optional_minus_graph + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() @@ -221,13 +226,19 @@ def _build_scale_trailing_graph(self, scale_3, sub_graph, trailing_len, total_le @staticmethod def _pure_inputs(num_digits): """Inputs 1, 10, 100, ... as num_digits-digit strings.""" - return pynini.union( - *[pynini.accep(str(10**k).zfill(num_digits)) for k in range(0, num_digits)] - ) + return pynini.union(*[pynini.accep(str(10**k).zfill(num_digits)) for k in range(0, num_digits)]) def _magnitude_graph( - self, one_word, plural_suffix, zero_count, graph_digit, graph_tens, graph_hundreds, - connector_e, insert_space, trailing_pair=None, + self, + one_word, + plural_suffix, + zero_count, + graph_digit, + graph_tens, + graph_hundreds, + connector_e, + insert_space, + trailing_pair=None, ): """Round (1–3 digit + scale + zeros); optional trailing (e + pure | space + compound).""" zeros = "0" * zero_count @@ -236,54 +247,51 @@ def _magnitude_graph( for L in (1, 2, 3): total = zero_count + L if L == 1: - lead = pynini.cross("1", one_word) | ( - (NEMO_DIGIT - "1") @ graph_digit + pynutil.insert(plural_suffix) - ) + lead = pynini.cross("1", one_word) | ((NEMO_DIGIT - "1") @ graph_digit + pynutil.insert(plural_suffix)) else: - lead = ( - pynini.closure(NEMO_DIGIT, L, L) - @ (graph_tens if L == 2 else graph_hundreds) - + pynutil.insert(plural_suffix) + lead = pynini.closure(NEMO_DIGIT, L, L) @ (graph_tens if L == 2 else graph_hundreds) + pynutil.insert( + plural_suffix ) lead_fst = NEMO_DIGIT**L @ lead - round_pats.append( - pynini.closure(NEMO_DIGIT, total, total) @ (lead_fst + pynutil.delete(zeros)) - ) + round_pats.append(pynini.closure(NEMO_DIGIT, total, total) @ (lead_fst + pynutil.delete(zeros))) if trailing_pair: pure, compound = trailing_pair - trail_part = ( - NEMO_DIGIT**zero_count @ (connector_e + pure) - | NEMO_DIGIT**zero_count @ (insert_space + compound) - ) - trail_pats.append( - pynini.closure(NEMO_DIGIT, total, total) @ (lead_fst + trail_part) + trail_part = NEMO_DIGIT**zero_count @ (connector_e + pure) | NEMO_DIGIT**zero_count @ ( + insert_space + compound ) + trail_pats.append(pynini.closure(NEMO_DIGIT, total, total) @ (lead_fst + trail_part)) graph_round = pynini.union(*round_pats) graph_trail = pynini.union(*trail_pats) if trail_pats else None return graph_round, graph_trail def _build_magnitude_pattern( self, - one_label, plural_suffix, magnitude_zeros, + one_label, + plural_suffix, + magnitude_zeros, trailing_pair, - connector_e, insert_space, - graph_digit, graph_tens, graph_hundreds, + connector_e, + insert_space, + graph_digit, + graph_tens, + graph_hundreds, ): """Restrict length; round + optional non-zero trailing.""" - restrict = (NEMO_DIGIT - "0") + pynini.closure( - NEMO_DIGIT, magnitude_zeros, magnitude_zeros + 2 - ) + restrict = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, magnitude_zeros, magnitude_zeros + 2) graph_round, graph_trail = self._magnitude_graph( - one_label, plural_suffix, magnitude_zeros, - graph_digit, graph_tens, graph_hundreds, - connector_e, insert_space, trailing_pair, + one_label, + plural_suffix, + magnitude_zeros, + graph_digit, + graph_tens, + graph_hundreds, + connector_e, + insert_space, + trailing_pair, ) if graph_trail is None: return pynutil.add_weight(restrict @ graph_round, -1.0) non_zero_trail = pynini.union( - *[ - NEMO_DIGIT**n + (NEMO_DIGIT**magnitude_zeros - pynini.accep("0" * magnitude_zeros)) - for n in (1, 2, 3) - ] + *[NEMO_DIGIT**n + (NEMO_DIGIT**magnitude_zeros - pynini.accep("0" * magnitude_zeros)) for n in (1, 2, 3)] ) return pynutil.add_weight(restrict @ (graph_round | (non_zero_trail @ graph_trail)), -1.0) diff --git a/nemo_text_processing/text_normalization/pt/taggers/decimal.py b/nemo_text_processing/text_normalization/pt/taggers/decimal.py index 440c451c3..c03fc7770 100644 --- a/nemo_text_processing/text_normalization/pt/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/pt/taggers/decimal.py @@ -16,12 +16,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.pt.graph_utils import ( - NEMO_DIGIT, - GraphFst, - insert_space, - delete_space -) +from nemo_text_processing.text_normalization.pt.graph_utils import NEMO_DIGIT, GraphFst, delete_space, insert_space from nemo_text_processing.text_normalization.pt.utils import get_abs_path @@ -43,9 +38,7 @@ class DecimalFst(GraphFst): def __init__(self, cardinal: GraphFst, deterministic: bool = True): super().__init__(name="decimal", kind="classify", deterministic=deterministic) cardinal_graph = cardinal.graph - _num = lambda name: pynini.string_file( - get_abs_path(f"data/numbers/{name}") - ).optimize() + _num = lambda name: pynini.string_file(get_abs_path(f"data/numbers/{name}")).optimize() comma = pynutil.delete(",") quantity_words = _num("quantity_words.tsv") @@ -53,22 +46,13 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): zero = _num("zero.tsv") fractional_specials = _num("decimal_fractional_specials.tsv") graph_digit_or_zero = pynini.union(digit, zero) - digit_by_digit = ( - graph_digit_or_zero - + pynini.closure(insert_space + graph_digit_or_zero) - ).optimize() + digit_by_digit = (graph_digit_or_zero + pynini.closure(insert_space + graph_digit_or_zero)).optimize() # Fractional: strip leading zeros → rest @ cardinal; all zeros → "zero" delete_leading_zero = pynini.cross("0", "") - rest = pynini.difference(NEMO_DIGIT, pynini.accep("0")) + pynini.closure( - NEMO_DIGIT, 0 - ) - with_rest = (pynini.closure(delete_leading_zero, 0) + rest) @ ( - pynini.closure(NEMO_DIGIT, 1) @ cardinal_graph - ) - only_zeros = pynini.closure(delete_leading_zero, 1) + pynini.cross( - "0", "zero" - ) + rest = pynini.difference(NEMO_DIGIT, pynini.accep("0")) + pynini.closure(NEMO_DIGIT, 0) + with_rest = (pynini.closure(delete_leading_zero, 0) + rest) @ (pynini.closure(NEMO_DIGIT, 1) @ cardinal_graph) + only_zeros = pynini.closure(delete_leading_zero, 1) + pynini.cross("0", "zero") fractional_strip = pynini.union(with_rest, only_zeros).optimize() # Prefer specials (001→mil e um, 010→mil e dez, 100→mil e cem) over strip when both match fractional_with_specials = pynini.union( @@ -82,10 +66,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): # Integer "0" → fractional strip only (no specials) graph_integer_zero = ( - pynutil.insert('integer_part: "') - + pynini.cross("0", "zero") - + pynutil.insert('"') - + insert_space + pynutil.insert('integer_part: "') + pynini.cross("0", "zero") + pynutil.insert('"') + insert_space ) graph_fractional_zero = ( pynutil.insert('fractional_part: "') @@ -95,9 +76,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ) + pynutil.insert('"') ) - decimal_when_zero = ( - graph_integer_zero + comma + insert_space + graph_fractional_zero - ) + decimal_when_zero = graph_integer_zero + comma + insert_space + graph_fractional_zero # Integer non-zero → fractional: specials | strip + cardinal | digit-by-digit graph_integer_pos = ( @@ -114,9 +93,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ) + pynutil.insert('"') ) - decimal_when_pos = ( - graph_integer_pos + comma + insert_space + graph_fractional_pos - ) + decimal_when_pos = graph_integer_pos + comma + insert_space + graph_fractional_pos decimal_core = pynini.union(decimal_when_zero, decimal_when_pos) integer_quantity = ( @@ -130,18 +107,10 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): + pynutil.insert('"') ) decimal_quantity = ( - decimal_core - + delete_space - + pynutil.insert('quantity: "') - + quantity_words - + pynutil.insert('"') - ) - final_graph_wo_sign = pynini.union( - decimal_core, integer_quantity, decimal_quantity - ) - optional_minus = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("-", '"true" '), 0, 1 + decimal_core + delete_space + pynutil.insert('quantity: "') + quantity_words + pynutil.insert('"') ) + final_graph_wo_sign = pynini.union(decimal_core, integer_quantity, decimal_quantity) + optional_minus = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", '"true" '), 0, 1) final_graph = optional_minus + final_graph_wo_sign self.fst = self.add_tokens(final_graph).optimize() diff --git a/nemo_text_processing/text_normalization/pt/taggers/fraction.py b/nemo_text_processing/text_normalization/pt/taggers/fraction.py index 4bfe84087..b5a206ff0 100644 --- a/nemo_text_processing/text_normalization/pt/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/pt/taggers/fraction.py @@ -19,8 +19,8 @@ NEMO_DIGIT, NEMO_SIGMA, NEMO_WHITE_SPACE, - insert_space, GraphFst, + insert_space, ) from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels @@ -44,34 +44,22 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = cardinal_graph = cardinal.graph # Denominators 2–10 use ordinal form (no data file: fixed set) - two_to_ten = pynini.union( - *[pynini.accep(str(d)) for d in range(2, 11)] - ).optimize() + two_to_ten = pynini.union(*[pynini.accep(str(d)) for d in range(2, 11)]).optimize() ord_digit_rows = load_labels(get_abs_path("data/ordinals/digit.tsv")) - ordinal_digit = pynini.string_map( - [(r[1], r[0]) for r in ord_digit_rows if len(r) >= 2] - ).optimize() + ordinal_digit = pynini.string_map([(r[1], r[0]) for r in ord_digit_rows if len(r) >= 2]).optimize() ord_exc_rows = load_labels(get_abs_path("data/fractions/ordinal_exceptions.tsv")) - ordinal_exceptions = pynini.string_map( - [(r[0], r[1]) for r in ord_exc_rows if len(r) >= 2] - ).optimize() + ordinal_exceptions = pynini.string_map([(r[0], r[1]) for r in ord_exc_rows if len(r) >= 2]).optimize() ord_hundreds_rows = load_labels(get_abs_path("data/ordinals/hundreds.tsv")) - ordinal_hundreds = pynini.string_map( - [(r[1], r[0]) for r in ord_hundreds_rows if len(r) >= 2] - ).optimize() + ordinal_hundreds = pynini.string_map([(r[1], r[0]) for r in ord_hundreds_rows if len(r) >= 2]).optimize() powers_rows = load_labels(get_abs_path("data/fractions/powers_of_ten.tsv")) - powers_of_ten = pynini.string_map( - [(r[0], r[1]) for r in powers_rows if len(r) >= 2] - ).optimize() + powers_of_ten = pynini.string_map([(r[0], r[1]) for r in powers_rows if len(r) >= 2]).optimize() denom_ordinal_form = two_to_ten @ cardinal_graph @ ordinal_digit - denom_ordinal_form = denom_ordinal_form @ pynini.cdrewrite( - ordinal_exceptions, "", "", NEMO_SIGMA - ) + denom_ordinal_form = denom_ordinal_form @ pynini.cdrewrite(ordinal_exceptions, "", "", NEMO_SIGMA) denom_ordinal = ( pynutil.insert('denominator: "') + denom_ordinal_form @@ -89,9 +77,7 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = + pynutil.insert('" morphosyntactic_features: "ordinal"') ) - denom_ordinal_2_10_100_1000 = pynini.union( - denom_ordinal, denom_100, denom_1000 - ) + denom_ordinal_2_10_100_1000 = pynini.union(denom_ordinal, denom_100, denom_1000) digit_plus = pynini.closure(NEMO_DIGIT, 1) denom_avos_input = pynini.difference( digit_plus, @@ -113,34 +99,19 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = slash_or_space_slash = pynini.union( pynini.cross("/", '" '), pynini.cross(" / ", '" '), - pynini.cross("\u2044", '" '), # fraction slash ⁄ + pynini.cross("\u2044", '" '), # fraction slash ⁄ pynini.cross(" \u2044 ", '" '), - pynini.cross("\u2215", '" '), # division slash ∕ + pynini.cross("\u2215", '" '), # division slash ∕ pynini.cross(" \u2215 ", '" '), ) - numerator = ( - pynutil.insert('numerator: "') - + cardinal_graph - + slash_or_space_slash - ) + numerator = pynutil.insert('numerator: "') + cardinal_graph + slash_or_space_slash fraction_core = numerator + denominator - integer_part = ( - pynutil.insert('integer_part: "') - + cardinal_graph - + pynutil.insert('"') - + insert_space - ) + integer_part = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"') + insert_space - optional_minus = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("-", '"true" '), 0, 1 - ) + optional_minus = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", '"true" '), 0, 1) - mixed = ( - integer_part - + pynini.closure(NEMO_WHITE_SPACE, 1) - + fraction_core - ) + mixed = integer_part + pynini.closure(NEMO_WHITE_SPACE, 1) + fraction_core graph = optional_minus + pynini.union(mixed, fraction_core) self.fst = self.add_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/pt/taggers/ordinal.py b/nemo_text_processing/text_normalization/pt/taggers/ordinal.py index 7f5a62d9d..de04269fb 100644 --- a/nemo_text_processing/text_normalization/pt/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/pt/taggers/ordinal.py @@ -16,10 +16,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.pt.graph_utils import ( - NEMO_DIGIT, - GraphFst, -) +from nemo_text_processing.text_normalization.pt.graph_utils import NEMO_DIGIT, GraphFst from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels @@ -60,10 +57,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): graph_ties + pynini.closure(conn + graph_digit, 0, 1), ) graph_hundred_component = pynini.union( - graph_hundreds - + pynini.closure( - conn + pynini.union(graph_tens, graph_digit), 0, 1 - ), + graph_hundreds + pynini.closure(conn + pynini.union(graph_tens, graph_digit), 0, 1), graph_tens, graph_digit, ) @@ -84,10 +78,6 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): + pynutil.insert('" morphosyntactic_features: "gender_masc"') ) graph_fem = ( - pynutil.insert('integer: "') - + to_ordinal_fem - + pynutil.insert('" morphosyntactic_features: "gender_fem"') + pynutil.insert('integer: "') + to_ordinal_fem + pynutil.insert('" morphosyntactic_features: "gender_fem"') ) - self.fst = self.add_tokens( - pynini.union(graph_masc, graph_fem) - ).optimize() + self.fst = self.add_tokens(pynini.union(graph_masc, graph_fem)).optimize() diff --git a/nemo_text_processing/text_normalization/pt/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/pt/taggers/tokenize_and_classify.py index 686fb8f6b..3c196df50 100644 --- a/nemo_text_processing/text_normalization/pt/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/pt/taggers/tokenize_and_classify.py @@ -18,6 +18,9 @@ import pynini from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst +from nemo_text_processing.text_normalization.en.taggers.whitelist import WhiteListFst +from nemo_text_processing.text_normalization.en.taggers.word import WordFst from nemo_text_processing.text_normalization.pt.graph_utils import ( NEMO_WHITE_SPACE, GraphFst, @@ -25,13 +28,10 @@ delete_space, generator_main, ) -from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst from nemo_text_processing.text_normalization.pt.taggers.cardinal import CardinalFst from nemo_text_processing.text_normalization.pt.taggers.decimal import DecimalFst from nemo_text_processing.text_normalization.pt.taggers.fraction import FractionFst from nemo_text_processing.text_normalization.pt.taggers.ordinal import OrdinalFst -from nemo_text_processing.text_normalization.en.taggers.whitelist import WhiteListFst -from nemo_text_processing.text_normalization.en.taggers.word import WordFst from nemo_text_processing.utils.logging import logger @@ -94,13 +94,15 @@ def __init__( # Wrap tokens properly token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") - punct_graph = pynutil.insert("tokens { ") + pynutil.add_weight(punctuation.fst, weight=2.1) + pynutil.insert(" }") - + punct_graph = ( + pynutil.insert("tokens { ") + pynutil.add_weight(punctuation.fst, weight=2.1) + pynutil.insert(" }") + ) + # Simple graph structure graph = token + pynini.closure( pynini.compose(pynini.closure(NEMO_WHITE_SPACE, 1), delete_extra_space) + token ) - + # Allow punctuation graph |= punct_graph @@ -112,4 +114,4 @@ def __init__( if __name__ == "__main__": - ClassifyFst(input_case="cased", deterministic=False) \ No newline at end of file + ClassifyFst(input_case="cased", deterministic=False) diff --git a/nemo_text_processing/text_normalization/pt/utils.py b/nemo_text_processing/text_normalization/pt/utils.py index e6e0c51fd..da4be3f89 100644 --- a/nemo_text_processing/text_normalization/pt/utils.py +++ b/nemo_text_processing/text_normalization/pt/utils.py @@ -47,4 +47,3 @@ def load_labels(abs_path: str): with open(abs_path, encoding="utf-8") as label_tsv: labels = list(csv.reader(label_tsv, delimiter="\t")) return labels - diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/pt/verbalizers/cardinal.py index b05d2c8ca..f735d95e3 100644 --- a/nemo_text_processing/text_normalization/pt/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/pt/verbalizers/cardinal.py @@ -16,11 +16,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.pt.graph_utils import ( - NEMO_NOT_QUOTE, - GraphFst, - shift_cardinal_gender_pt, -) +from nemo_text_processing.text_normalization.pt.graph_utils import NEMO_NOT_QUOTE, GraphFst, shift_cardinal_gender_pt class CardinalFst(GraphFst): @@ -47,7 +43,7 @@ def __init__(self, deterministic: bool = True): # Generate masculine form (default) graph_masc = optional_sign + integer - + # Generate feminine form using Portuguese gender conversion graph_fem = shift_cardinal_gender_pt(graph_masc) @@ -64,4 +60,4 @@ def __init__(self, deterministic: bool = True): pass delete_tokens = self.delete_tokens(graph) - self.fst = delete_tokens.optimize() \ No newline at end of file + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/decimal.py b/nemo_text_processing/text_normalization/pt/verbalizers/decimal.py index cad0656e6..eea1e4d17 100644 --- a/nemo_text_processing/text_normalization/pt/verbalizers/decimal.py +++ b/nemo_text_processing/text_normalization/pt/verbalizers/decimal.py @@ -15,12 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.pt.graph_utils import ( - NEMO_NOT_QUOTE, - GraphFst, - delete_space, - insert_space, -) +from nemo_text_processing.text_normalization.pt.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels @@ -43,20 +38,10 @@ def __init__(self, deterministic: bool = True): sep = spec.get("separator", "vírgula") minus = spec.get("minus", "menos") - optional_sign = pynini.closure( - pynini.cross('negative: "true" ', minus + " "), 0, 1 - ) + optional_sign = pynini.closure(pynini.cross('negative: "true" ', minus + " "), 0, 1) - integer = ( - pynutil.delete('integer_part: "') - + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete('"') - ) - fractional = ( - pynutil.delete('fractional_part: "') - + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete('"') - ) + integer = pynutil.delete('integer_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + fractional = pynutil.delete('fractional_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') quantity = ( delete_space + insert_space diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/fraction.py b/nemo_text_processing/text_normalization/pt/verbalizers/fraction.py index a9ac5b2bd..f289b4865 100644 --- a/nemo_text_processing/text_normalization/pt/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/pt/verbalizers/fraction.py @@ -15,11 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.pt.graph_utils import ( - NEMO_NOT_QUOTE, - GraphFst, - insert_space, -) +from nemo_text_processing.text_normalization.pt.graph_utils import NEMO_NOT_QUOTE, GraphFst, insert_space from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels @@ -46,26 +42,14 @@ def __init__(self, deterministic: bool = True): numerator_one_val = spec.get("numerator_one", "um") denominator_half_val = spec.get("denominator_half", "meio") - optional_sign = pynini.closure( - pynini.cross('negative: "true" ', minus) + insert_space, 0, 1 - ) + optional_sign = pynini.closure(pynini.cross('negative: "true" ', minus) + insert_space, 0, 1) - integer = ( - pynutil.delete('integer_part: "') - + pynini.closure(NEMO_NOT_QUOTE) - + pynutil.delete('" ') - ) + integer = pynutil.delete('integer_part: "') + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete('" ') - numerator_one = ( - pynutil.delete('numerator: "') - + pynini.accep(numerator_one_val) - + pynutil.delete('" ') - ) + numerator_one = pynutil.delete('numerator: "') + pynini.accep(numerator_one_val) + pynutil.delete('" ') numerator_rest = ( pynutil.delete('numerator: "') - + pynini.difference( - pynini.closure(NEMO_NOT_QUOTE), pynini.accep(numerator_one_val) - ) + + pynini.difference(pynini.closure(NEMO_NOT_QUOTE), pynini.accep(numerator_one_val)) + pynutil.delete('" ') ) @@ -86,30 +70,16 @@ def __init__(self, deterministic: bool = True): ) fraction_ordinal_singular = numerator_one + insert_space + denom_ordinal - fraction_ordinal_plural = ( - numerator_rest + insert_space + denom_ordinal + pynutil.insert(plural_suffix) - ) - fraction_ordinal = pynini.union( - fraction_ordinal_singular, fraction_ordinal_plural - ) + fraction_ordinal_plural = numerator_rest + insert_space + denom_ordinal + pynutil.insert(plural_suffix) + fraction_ordinal = pynini.union(fraction_ordinal_singular, fraction_ordinal_plural) fraction_avos = ( - pynini.union(numerator_one, numerator_rest) - + insert_space - + denom_avos - + pynutil.insert(avos_suffix) + pynini.union(numerator_one, numerator_rest) + insert_space + denom_avos + pynutil.insert(avos_suffix) ) fraction = pynini.union(fraction_ordinal, fraction_avos) - mixed_um_meio = ( - integer - + connector - + pynutil.delete('numerator: "' + numerator_one_val + '" " ') - + denom_meio - ) - optional_integer = pynini.closure( - integer + connector + insert_space, 0, 1 - ) + mixed_um_meio = integer + connector + pynutil.delete('numerator: "' + numerator_one_val + '" " ') + denom_meio + optional_integer = pynini.closure(integer + connector + insert_space, 0, 1) graph = optional_sign + pynini.union( pynutil.add_weight(mixed_um_meio, -0.01), optional_integer + fraction, diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/pt/verbalizers/ordinal.py index c495d0328..9be8876fd 100644 --- a/nemo_text_processing/text_normalization/pt/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/pt/verbalizers/ordinal.py @@ -15,12 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.pt.graph_utils import ( - NEMO_NOT_QUOTE, - NEMO_SIGMA, - NEMO_SPACE, - GraphFst, -) +from nemo_text_processing.text_normalization.pt.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, NEMO_SPACE, GraphFst from nemo_text_processing.text_normalization.pt.utils import get_abs_path @@ -37,15 +32,9 @@ class OrdinalFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic) - integer = ( - pynutil.delete('integer: "') - + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete('"') - ) + integer = pynutil.delete('integer: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') - fem_rewrite = pynini.string_file( - get_abs_path("data/ordinals/feminine.tsv") - ) + fem_rewrite = pynini.string_file(get_abs_path("data/ordinals/feminine.tsv")) feminine_rewrite = pynini.cdrewrite( fem_rewrite, "", @@ -53,14 +42,6 @@ def __init__(self, deterministic: bool = True): NEMO_SIGMA, ) - graph_masc = ( - integer - + pynutil.delete(' morphosyntactic_features: "gender_masc"') - ) - graph_fem = ( - (integer @ feminine_rewrite) - + pynutil.delete(' morphosyntactic_features: "gender_fem"') - ) - self.fst = self.delete_tokens( - pynini.union(graph_masc, graph_fem) - ).optimize() + graph_masc = integer + pynutil.delete(' morphosyntactic_features: "gender_masc"') + graph_fem = (integer @ feminine_rewrite) + pynutil.delete(' morphosyntactic_features: "gender_fem"') + self.fst = self.delete_tokens(pynini.union(graph_masc, graph_fem)).optimize() diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py index 4fa03465b..76f2a032a 100644 --- a/nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py @@ -40,4 +40,4 @@ def __init__(self, deterministic: bool = True): decimal = DecimalFst(deterministic=deterministic) graph = fraction.fst | decimal.fst | ordinal.fst | cardinal.fst - self.fst = graph \ No newline at end of file + self.fst = graph diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/pt/verbalizers/verbalize_final.py index 5b6488bd1..cc2eaae3d 100644 --- a/nemo_text_processing/text_normalization/pt/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/pt/verbalizers/verbalize_final.py @@ -17,13 +17,13 @@ import pynini from pynini.lib import pynutil +from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst from nemo_text_processing.text_normalization.pt.graph_utils import ( GraphFst, delete_extra_space, delete_space, generator_main, ) -from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst from nemo_text_processing.text_normalization.pt.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.utils.logging import logger @@ -68,4 +68,4 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ self.fst = graph.optimize() if far_file: - generator_main(far_file, {"verbalize": self.fst}) \ No newline at end of file + generator_main(far_file, {"verbalize": self.fst}) diff --git a/tests/nemo_text_processing/pt/test_cardinal.py b/tests/nemo_text_processing/pt/test_cardinal.py index 5271626ce..dfadad09f 100644 --- a/tests/nemo_text_processing/pt/test_cardinal.py +++ b/tests/nemo_text_processing/pt/test_cardinal.py @@ -33,9 +33,10 @@ def test_denorm(self, test_input, expected): assert pred == expected normalizer = Normalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + @parameterized.expand(parse_test_case_file('pt/data_text_normalization/test_cases_cardinal.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) - assert pred == expected \ No newline at end of file + assert pred == expected diff --git a/tests/nemo_text_processing/pt/test_decimal.py b/tests/nemo_text_processing/pt/test_decimal.py index bdba649d9..67376d476 100644 --- a/tests/nemo_text_processing/pt/test_decimal.py +++ b/tests/nemo_text_processing/pt/test_decimal.py @@ -32,9 +32,10 @@ def test_denorm(self, test_input, expected): assert pred == expected normalizer = Normalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + @parameterized.expand(parse_test_case_file('pt/data_text_normalization/test_cases_decimal.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) - assert pred == expected \ No newline at end of file + assert pred == expected diff --git a/tests/nemo_text_processing/pt/test_ordinal.py b/tests/nemo_text_processing/pt/test_ordinal.py index 06b5cd15d..8602e8700 100644 --- a/tests/nemo_text_processing/pt/test_ordinal.py +++ b/tests/nemo_text_processing/pt/test_ordinal.py @@ -32,9 +32,10 @@ def test_denorm(self, test_input, expected): assert pred == expected normalizer = Normalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + @parameterized.expand(parse_test_case_file('pt/data_text_normalization/test_cases_ordinal.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) - assert pred == expected \ No newline at end of file + assert pred == expected From 81eb5b523f3bc58fd82f88d3f3551b34344ee570 Mon Sep 17 00:00:00 2001 From: Mai Anh Date: Sat, 4 Apr 2026 02:33:21 +0700 Subject: [PATCH 3/4] date and time semiotic classese --- .../pt/data/date/__init__.py | 13 ++ .../pt/data/date/months.tsv | 21 ++ .../pt/data/date/numeric_separators.tsv | 3 + .../pt/data/date/verbal_phrases.tsv | 1 + .../pt/data/fractions/__init__.py | 13 ++ .../pt/data/numbers/teens.tsv | 2 +- .../pt/data/time/__init__.py | 13 ++ .../pt/data/time/day_period_suffix.tsv | 4 + .../text_normalization/pt/graph_utils.py | 4 + .../text_normalization/pt/taggers/cardinal.py | 6 +- .../text_normalization/pt/taggers/date.py | 186 ++++++++++++++++++ .../text_normalization/pt/taggers/time.py | 144 ++++++++++++++ .../pt/taggers/tokenize_and_classify.py | 6 + .../text_normalization/pt/verbalizers/date.py | 52 +++++ .../text_normalization/pt/verbalizers/time.py | 84 ++++++++ .../pt/verbalizers/verbalize.py | 13 +- .../test_cases_abbreviation.txt | 46 +++++ .../test_cases_address.txt | 32 +++ .../test_cases_cardinal.txt | 1 - .../test_cases_date.txt | 26 +++ .../test_cases_decimal.txt | 7 +- .../test_cases_electronic.txt | 27 +++ .../test_cases_measure.txt | 33 ++++ .../test_cases_money.txt | 40 ++++ .../test_cases_punctuation.txt | 36 ++++ .../test_cases_range.txt | 29 +++ .../test_cases_roman.txt | 40 ++++ .../test_cases_serial.txt | 27 +++ .../test_cases_telephone.txt | 28 +++ .../test_cases_time.txt | 24 +++ .../test_cases_whitelist.txt | 19 ++ .../test_cases_word.txt | 34 ++++ .../nemo_text_processing/pt/test_cardinal.py | 2 +- tests/nemo_text_processing/pt/test_date.py | 9 + tests/nemo_text_processing/pt/test_decimal.py | 2 +- .../nemo_text_processing/pt/test_fraction.py | 2 +- tests/nemo_text_processing/pt/test_ordinal.py | 2 +- .../pt/test_sparrowhawk_normalization.sh | 86 ++++++++ tests/nemo_text_processing/pt/test_time.py | 10 +- 39 files changed, 1113 insertions(+), 14 deletions(-) create mode 100644 nemo_text_processing/text_normalization/pt/data/date/__init__.py create mode 100644 nemo_text_processing/text_normalization/pt/data/date/months.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/date/numeric_separators.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/date/verbal_phrases.tsv create mode 100644 nemo_text_processing/text_normalization/pt/data/fractions/__init__.py create mode 100644 nemo_text_processing/text_normalization/pt/data/time/__init__.py create mode 100644 nemo_text_processing/text_normalization/pt/data/time/day_period_suffix.tsv create mode 100644 nemo_text_processing/text_normalization/pt/taggers/date.py create mode 100644 nemo_text_processing/text_normalization/pt/taggers/time.py create mode 100644 nemo_text_processing/text_normalization/pt/verbalizers/date.py create mode 100644 nemo_text_processing/text_normalization/pt/verbalizers/time.py create mode 100644 tests/nemo_text_processing/pt/data_text_normalization/test_cases_abbreviation.txt create mode 100644 tests/nemo_text_processing/pt/data_text_normalization/test_cases_address.txt create mode 100644 tests/nemo_text_processing/pt/data_text_normalization/test_cases_date.txt create mode 100644 tests/nemo_text_processing/pt/data_text_normalization/test_cases_electronic.txt create mode 100644 tests/nemo_text_processing/pt/data_text_normalization/test_cases_measure.txt create mode 100644 tests/nemo_text_processing/pt/data_text_normalization/test_cases_money.txt create mode 100644 tests/nemo_text_processing/pt/data_text_normalization/test_cases_punctuation.txt create mode 100644 tests/nemo_text_processing/pt/data_text_normalization/test_cases_range.txt create mode 100644 tests/nemo_text_processing/pt/data_text_normalization/test_cases_roman.txt create mode 100644 tests/nemo_text_processing/pt/data_text_normalization/test_cases_serial.txt create mode 100644 tests/nemo_text_processing/pt/data_text_normalization/test_cases_telephone.txt create mode 100644 tests/nemo_text_processing/pt/data_text_normalization/test_cases_time.txt create mode 100644 tests/nemo_text_processing/pt/data_text_normalization/test_cases_whitelist.txt create mode 100644 tests/nemo_text_processing/pt/data_text_normalization/test_cases_word.txt create mode 100755 tests/nemo_text_processing/pt/test_sparrowhawk_normalization.sh diff --git a/nemo_text_processing/text_normalization/pt/data/date/__init__.py b/nemo_text_processing/text_normalization/pt/data/date/__init__.py new file mode 100644 index 000000000..9e3fb699d --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/date/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/pt/data/date/months.tsv b/nemo_text_processing/text_normalization/pt/data/date/months.tsv new file mode 100644 index 000000000..6713229bd --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/date/months.tsv @@ -0,0 +1,21 @@ +1 janeiro +01 janeiro +2 fevereiro +02 fevereiro +3 março +03 março +4 abril +04 abril +5 maio +05 maio +6 junho +06 junho +7 julho +07 julho +8 agosto +08 agosto +9 setembro +09 setembro +10 outubro +11 novembro +12 dezembro diff --git a/nemo_text_processing/text_normalization/pt/data/date/numeric_separators.tsv b/nemo_text_processing/text_normalization/pt/data/date/numeric_separators.tsv new file mode 100644 index 000000000..ee24567bc --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/date/numeric_separators.tsv @@ -0,0 +1,3 @@ +/ +. +- diff --git a/nemo_text_processing/text_normalization/pt/data/date/verbal_phrases.tsv b/nemo_text_processing/text_normalization/pt/data/date/verbal_phrases.tsv new file mode 100644 index 000000000..d04c0fa50 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/date/verbal_phrases.tsv @@ -0,0 +1 @@ +preposition de diff --git a/nemo_text_processing/text_normalization/pt/data/fractions/__init__.py b/nemo_text_processing/text_normalization/pt/data/fractions/__init__.py new file mode 100644 index 000000000..9e3fb699d --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/fractions/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/pt/data/numbers/teens.tsv b/nemo_text_processing/text_normalization/pt/data/numbers/teens.tsv index 299c3dbf2..50c4e0b8d 100644 --- a/nemo_text_processing/text_normalization/pt/data/numbers/teens.tsv +++ b/nemo_text_processing/text_normalization/pt/data/numbers/teens.tsv @@ -2,7 +2,7 @@ 11 onze 12 doze 13 treze -14 quatorze +14 catorze 15 quinze 16 dezesseis 17 dezessete diff --git a/nemo_text_processing/text_normalization/pt/data/time/__init__.py b/nemo_text_processing/text_normalization/pt/data/time/__init__.py new file mode 100644 index 000000000..9e3fb699d --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/time/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/pt/data/time/day_period_suffix.tsv b/nemo_text_processing/text_normalization/pt/data/time/day_period_suffix.tsv new file mode 100644 index 000000000..93a6d9086 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/time/day_period_suffix.tsv @@ -0,0 +1,4 @@ +manhã da manhã +tarde da tarde +noite da noite +madrugada da madrugada diff --git a/nemo_text_processing/text_normalization/pt/graph_utils.py b/nemo_text_processing/text_normalization/pt/graph_utils.py index d2e6c9ec7..0b41ba197 100644 --- a/nemo_text_processing/text_normalization/pt/graph_utils.py +++ b/nemo_text_processing/text_normalization/pt/graph_utils.py @@ -49,6 +49,10 @@ insert_space = pynutil.insert(" ") delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ").optimize() +delete_preserve_order = pynini.closure( + pynutil.delete(" preserve_order: true") +) + def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]) -> None: """ diff --git a/nemo_text_processing/text_normalization/pt/taggers/cardinal.py b/nemo_text_processing/text_normalization/pt/taggers/cardinal.py index a56057852..393aabc0e 100644 --- a/nemo_text_processing/text_normalization/pt/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/pt/taggers/cardinal.py @@ -77,9 +77,10 @@ def __init__(self, deterministic: bool = True): (connector_e + graph_tens), (connector_e + digit), ) + # "100" -> cem only (cross("1", cento)+delete("00") would also match "100" but + # yields "cento"; OpenFst vs pynini top_rewrite can disagree on ties — Sparrowhawk). graph_hundreds |= pynini.cross("100", hundred_100) graph_hundreds |= pynini.cross("1", hundred_1) + pynini.union( - pynutil.delete("00"), (connector_e + graph_tens), (connector_e + pynutil.delete("0") + digit), ) @@ -116,7 +117,8 @@ def __init__(self, deterministic: bool = True): pynutil.delete("000"), (connector_e + graph_pure_components), (insert_space + graph_compound_hundreds), - (insert_space + pynutil.delete("0") + graph_compound_tens), + # Use connector_e so "2024" -> dois mil e vinte e quatro (not dois mil vinte e quatro). + (connector_e + pynutil.delete("0") + graph_compound_tens), ) t_comp = pynini.union( diff --git a/nemo_text_processing/text_normalization/pt/taggers/date.py b/nemo_text_processing/text_normalization/pt/taggers/date.py new file mode 100644 index 000000000..69c63aedd --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/taggers/date.py @@ -0,0 +1,186 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import ( + NEMO_DIGIT, + NEMO_SIGMA, + GraphFst, + delete_space, + insert_space, +) +from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels + + +class DateFst(GraphFst): + """ + Finite state transducer for classifying Portuguese (Brazilian) dates, e.g. + 15/03/2024 -> date { day: "quinze" month: "março" year: "dois mil e vinte e quatro" preserve_order: true } + 15 de março de 2024 -> date { day: "quinze" month: "março" year: "dois mil e vinte e quatro" preserve_order: true } + 2024-03-15 -> date { day: "quinze" month: "março" year: "dois mil e vinte e quatro" preserve_order: true } + 03/15/2024 -> date { day: "quinze" month: "março" year: "dois mil e vinte e quatro" preserve_order: true } + """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="date", kind="classify", deterministic=deterministic) + numbers = cardinal.graph + + month_rows = load_labels(get_abs_path("data/date/months.tsv")) + month_pairs = [(r[0], r[1]) for r in month_rows if len(r) >= 2] + month_to_word = pynini.string_map(month_pairs).optimize() + + day_10_31 = ( + (NEMO_DIGIT - "0") + NEMO_DIGIT + ) @ pynini.union(*[str(x) for x in range(10, 32)]) @ numbers + day_02_09 = pynutil.delete("0") + ( + pynini.union(*[str(x) for x in range(2, 10)]) @ numbers + ) + day_2_9 = pynini.union(*[str(x) for x in range(2, 10)]) @ numbers + day_inner = pynini.union( + pynini.cross("01", "primeiro"), + day_10_31, + day_02_09, + day_2_9, + pynini.cross("1", "primeiro"), + ).optimize() + day_part = pynutil.insert('day: "') + day_inner + pynutil.insert('"') + + month_digits = ( + pynini.union("10", "11", "12") + | pynutil.delete("0") + pynini.union(*[str(x) for x in range(1, 10)]) + | pynini.union(*[str(x) for x in range(1, 10)]) + ) + month_num = month_digits @ month_to_word + month_part = pynutil.insert('month: "') + month_num + pynutil.insert('"') + + year_num = ((NEMO_DIGIT - "0") + NEMO_DIGIT**3) @ numbers + year_part = pynutil.insert('year: "') + year_num + pynutil.insert('"') + + preserve = pynutil.insert(" preserve_order: true") + + delete_de = delete_space + pynutil.delete("de") + delete_space + month_names = sorted({r[1] for r in month_rows if len(r) >= 2}, key=len, reverse=True) + text_pairs = [] + for name in month_names: + text_pairs.append((name, name)) + if name and name[0].islower(): + text_pairs.append((name[0].upper() + name[1:], name)) + month_written = pynutil.insert('month: "') + pynini.string_map(text_pairs).optimize() + pynutil.insert('"') + graph_text = day_part + delete_de + month_written + delete_de + year_part + preserve + + sep_path = get_abs_path("data/date/numeric_separators.tsv") + separators = [r[0].strip() for r in load_labels(sep_path) if r and r[0].strip()] + + one_or_two_digits = pynini.closure(NEMO_DIGIT, 1, 2) + year_four = (NEMO_DIGIT - "0") + NEMO_DIGIT**3 + _mdy_weight = 0.05 + + months_spoken = sorted({r[1] for r in month_rows if len(r) >= 2}) + day_spokens = set() + for n in range(1, 32): + for key in (str(n), f"{n:02d}"): + dstr = pynini.shortestpath( + pynini.compose(pynini.accep(key), day_inner.optimize()) + ).string() + day_spokens.add(dstr) + + _preserve_tail = " preserve_order: true" + + ymd_to_dmy_graph = None + mdy_to_dmy_graph = None + for month in months_spoken: + for day in day_spokens: + # After year: + sigma (year value + quotes), delete month/day and trailing preserve + # so the input is fully consumed (mdy_to_dmy does not need this: sigma eats the tail). + ymd_curr = ( + pynutil.insert('day: "' + day + '" month: "' + month + '" ') + + pynini.accep("year:") + + NEMO_SIGMA + + pynutil.delete( + ' month: "' + month + '" day: "' + day + '"' + _preserve_tail + ) + ) + ymd_to_dmy_graph = ymd_curr if ymd_to_dmy_graph is None else pynini.union(ymd_to_dmy_graph, ymd_curr) + + mdy_curr = ( + pynutil.insert('day: "' + day + '" month: "' + month + '" ') + + pynutil.delete('month: "' + month + '" day: "' + day + '" ') + + pynini.accep("year:") + + NEMO_SIGMA + ) + mdy_to_dmy_graph = mdy_curr if mdy_to_dmy_graph is None else pynini.union(mdy_to_dmy_graph, mdy_curr) + + ymd_to_dmy_graph = ymd_to_dmy_graph.optimize() + mdy_to_dmy_graph = mdy_to_dmy_graph.optimize() + + patterns = [graph_text] + for sep in separators: + sep_accep = pynini.accep(pynini.escape(sep)) + del_sep = pynutil.delete(sep_accep) + + dmy_core = ( + day_part + + del_sep + + insert_space + + month_part + + del_sep + + insert_space + + year_part + + preserve + ) + iso_core = ( + year_part + + del_sep + + insert_space + + month_part + + del_sep + + insert_space + + day_part + + preserve + ) + mdy_core = ( + month_part + + del_sep + + insert_space + + day_part + + del_sep + + insert_space + + year_part + + preserve + ) + + lhs_dmy = one_or_two_digits + sep_accep + one_or_two_digits + sep_accep + year_four + lhs_iso = year_four + sep_accep + one_or_two_digits + sep_accep + one_or_two_digits + lhs_mdy = one_or_two_digits + sep_accep + one_or_two_digits + sep_accep + year_four + + patterns.append(pynini.compose(lhs_dmy, dmy_core)) + patterns.append( + pynutil.add_weight( + pynini.compose( + pynini.compose(lhs_mdy, mdy_core), + mdy_to_dmy_graph, + ), + _mdy_weight, + ) + ) + patterns.append( + pynini.compose( + pynini.compose(lhs_iso, iso_core), + ymd_to_dmy_graph, + ) + ) + + self.fst = self.add_tokens(pynini.union(*patterns).optimize()).optimize() diff --git a/nemo_text_processing/text_normalization/pt/taggers/time.py b/nemo_text_processing/text_normalization/pt/taggers/time.py new file mode 100644 index 000000000..c4821e91b --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/taggers/time.py @@ -0,0 +1,144 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import ( + NEMO_DIGIT, + GraphFst, + delete_space, + insert_space, +) +from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels + + +class TimeFst(GraphFst): + """ + Finite state transducer for classifying Portuguese (Brazilian) time, e.g. + 14:30 -> time { hours: "catorze" minutes: "trinta" preserve_order: true } + 14:30:05 -> time { hours: "catorze" minutes: "trinta" seconds: "cinco" preserve_order: true } + 12:00 -> time { hours: "doze" preserve_order: true } + 11:00 da manhã -> time { hours: "onze" suffix: "da manhã" preserve_order: true } + + Args: + cardinal: cardinal GraphFst + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="time", kind="classify", deterministic=deterministic) + cardinal_graph = cardinal.graph + + labels_hour = [str(x) for x in range(0, 24)] + labels_minute_single = [str(x) for x in range(1, 10)] + labels_minute_double = [str(x) for x in range(10, 60)] + + delete_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | ( + pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT + ) + + graph_hour = ( + delete_leading_zero_to_double_digit + @ pynini.union(*labels_hour) + @ cardinal_graph + ) + + graph_minute_single = pynini.union(*labels_minute_single) @ cardinal_graph + graph_minute_double = pynini.union(*labels_minute_double) @ cardinal_graph + final_graph_minute = ( + pynutil.insert('minutes: "') + + ( + pynutil.delete("0") + graph_minute_single + | graph_minute_double + ) + + pynutil.insert('"') + ) + + final_graph_second = ( + pynutil.insert('seconds: "') + + ( + pynutil.delete("0") + graph_minute_single + | graph_minute_double + ) + + pynutil.insert('"') + ) + + final_graph_hour = ( + pynutil.insert('hours: "') + graph_hour + pynutil.insert('"') + ) + + delete_h = pynini.union( + pynutil.delete(pynini.accep(pynini.escape("h"))), + pynutil.delete(pynini.accep(pynini.escape("H"))), + ) + + time_delim = pynini.union( + pynini.accep(pynini.escape(":")), + pynini.accep(pynini.escape(".")), + ) + + period_rows = load_labels(get_abs_path("data/time/day_period_suffix.tsv")) + period_branches = [] + for row in period_rows: + if len(row) < 2 or not row[0].strip(): + continue + tail, tag_val = row[0].strip(), row[1].strip() + period_branches.append( + pynutil.delete(tail) + pynutil.insert(f'suffix: "{tag_val}"') + ) + suffix_tail = ( + delete_space + + pynutil.delete("da") + + delete_space + + pynini.union(*period_branches) + ) + optional_suffix = pynini.closure(insert_space + suffix_tail, 0, 1) + + graph_hm = ( + final_graph_hour + + pynutil.delete(time_delim) + + (pynutil.delete("00") | insert_space + final_graph_minute) + + optional_suffix + + pynutil.insert(" preserve_order: true") + ) + + graph_h_minute = ( + final_graph_hour + + delete_h + + (pynutil.delete("00") | insert_space + final_graph_minute) + + optional_suffix + + pynutil.insert(" preserve_order: true") + ) + + graph_h_only = ( + final_graph_hour + + delete_h + + optional_suffix + + pynutil.insert(" preserve_order: true") + ) + + graph_hms = ( + final_graph_hour + + pynutil.delete(time_delim) + + (pynutil.delete("00") | insert_space + final_graph_minute) + + pynutil.delete(time_delim) + + (pynutil.delete("00") | insert_space + final_graph_second) + + optional_suffix + + pynutil.insert(" preserve_order: true") + ) + + final_graph = (graph_hm | graph_h_minute | graph_h_only | graph_hms).optimize() + self.fst = self.add_tokens(final_graph).optimize() diff --git a/nemo_text_processing/text_normalization/pt/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/pt/taggers/tokenize_and_classify.py index 3c196df50..bbc7a14b7 100644 --- a/nemo_text_processing/text_normalization/pt/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/pt/taggers/tokenize_and_classify.py @@ -29,9 +29,11 @@ generator_main, ) from nemo_text_processing.text_normalization.pt.taggers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.pt.taggers.date import DateFst from nemo_text_processing.text_normalization.pt.taggers.decimal import DecimalFst from nemo_text_processing.text_normalization.pt.taggers.fraction import FractionFst from nemo_text_processing.text_normalization.pt.taggers.ordinal import OrdinalFst +from nemo_text_processing.text_normalization.pt.taggers.time import TimeFst from nemo_text_processing.utils.logging import logger @@ -78,6 +80,8 @@ def __init__( ordinal = OrdinalFst(cardinal, deterministic=deterministic) fraction = FractionFst(cardinal, ordinal, deterministic=deterministic) decimal = DecimalFst(cardinal, deterministic=deterministic) + date = DateFst(cardinal, deterministic=deterministic) + time = TimeFst(cardinal, deterministic=deterministic) punctuation = PunctuationFst(deterministic=deterministic) word_graph = WordFst(punctuation=punctuation, deterministic=deterministic).fst @@ -85,6 +89,8 @@ def __init__( classify = ( pynutil.add_weight(whitelist.fst, 1.01) + | pynutil.add_weight(date.fst, 1.09) + | pynutil.add_weight(time.fst, 1.1) | pynutil.add_weight(fraction.fst, 1.1) | pynutil.add_weight(decimal.fst, 1.1) | pynutil.add_weight(ordinal.fst, 1.1) diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/date.py b/nemo_text_processing/text_normalization/pt/verbalizers/date.py new file mode 100644 index 000000000..990012a37 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/verbalizers/date.py @@ -0,0 +1,52 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import ( + NEMO_NOT_QUOTE, + GraphFst, + delete_preserve_order, + delete_space, + insert_space, +) +from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels + + +class DateFst(GraphFst): + """ + Finite state transducer for verbalizing Portuguese (Brazilian) dates, e.g. + date { day: "quinze" month: "março" year: "dois mil e vinte e quatro" preserve_order: true } + -> quinze de março de dois mil e vinte e quatro + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="date", kind="verbalize", deterministic=deterministic) + + vrows = load_labels(get_abs_path("data/date/verbal_phrases.tsv")) + vp = {r[0].strip(): r[1].strip() for r in vrows if len(r) >= 2 and r[0].strip()} + prep = vp.get("preposition", "de") + " " + + quoted = pynini.closure(NEMO_NOT_QUOTE, 1) + + day_expr = pynutil.delete('day: "') + quoted + pynutil.delete('"') + month_expr = pynutil.delete('month: "') + quoted + pynutil.delete('"') + year_expr = pynutil.delete('year: "') + quoted + pynutil.delete('"') + + ws = delete_space + insert_space + glue = ws + pynutil.insert(prep) + ws + + graph_dmy = day_expr + glue + month_expr + glue + year_expr + delete_preserve_order + self.fst = self.delete_tokens(graph_dmy).optimize() diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/time.py b/nemo_text_processing/text_normalization/pt/verbalizers/time.py new file mode 100644 index 000000000..979aa605f --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/verbalizers/time.py @@ -0,0 +1,84 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import ( + NEMO_NOT_QUOTE, + GraphFst, + delete_preserve_order, + delete_space, + insert_space, +) +from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels + + +class TimeFst(GraphFst): + """ + Finite state transducer for verbalizing Portuguese time, e.g. + time { hours: "catorze" minutes: "trinta" preserve_order: true } -> catorze horas e trinta + time { hours: "catorze" minutes: "trinta" seconds: "cinco" preserve_order: true } + -> catorze horas e trinta minutos e cinco segundos + time { hours: "onze" suffix: "da manhã" preserve_order: true } -> onze horas da manhã + time { hours: "doze" preserve_order: true } -> doze horas + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="time", kind="verbalize", deterministic=deterministic) + + quoted = pynini.closure(NEMO_NOT_QUOTE, 1) + + hours = pynutil.delete('hours: "') + quoted + pynutil.delete('"') + minutes_val = pynutil.delete('minutes: "') + quoted + pynutil.delete('"') + seconds_val = pynutil.delete('seconds: "') + quoted + pynutil.delete('"') + suffix_val = pynutil.delete('suffix: "') + quoted + pynutil.delete('"') + + gap = delete_space + insert_space + suffix_out = pynini.closure(gap + suffix_val, 0, 1) + + graph_hms = ( + hours + + gap + + pynutil.insert("horas e ") + + minutes_val + + gap + + pynutil.insert("minutos e ") + + seconds_val + + gap + + pynutil.insert("segundos") + + suffix_out + + delete_preserve_order + ) + + with_minutes = ( + hours + + gap + + pynutil.insert("horas") + + gap + + pynutil.insert("e ") + + gap + + minutes_val + + suffix_out + + delete_preserve_order + ) + + hours_only = hours + gap + pynutil.insert("horas") + suffix_out + delete_preserve_order + + graph = pynini.union(graph_hms, with_minutes, hours_only).optimize() + self.fst = self.delete_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py index 76f2a032a..3976faa87 100644 --- a/nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py @@ -15,9 +15,11 @@ from nemo_text_processing.text_normalization.pt.graph_utils import GraphFst from nemo_text_processing.text_normalization.pt.verbalizers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.pt.verbalizers.date import DateFst from nemo_text_processing.text_normalization.pt.verbalizers.decimal import DecimalFst from nemo_text_processing.text_normalization.pt.verbalizers.fraction import FractionFst from nemo_text_processing.text_normalization.pt.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.text_normalization.pt.verbalizers.time import TimeFst class VerbalizeFst(GraphFst): @@ -38,6 +40,15 @@ def __init__(self, deterministic: bool = True): ordinal = OrdinalFst(deterministic=deterministic) fraction = FractionFst(deterministic=deterministic) decimal = DecimalFst(deterministic=deterministic) - graph = fraction.fst | decimal.fst | ordinal.fst | cardinal.fst + date = DateFst(deterministic=deterministic) + time = TimeFst(deterministic=deterministic) + graph = ( + fraction.fst + | decimal.fst + | date.fst + | time.fst + | ordinal.fst + | cardinal.fst + ) self.fst = graph diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_abbreviation.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_abbreviation.txt new file mode 100644 index 000000000..367024c3d --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_abbreviation.txt @@ -0,0 +1,46 @@ +# Portuguese TN Abbreviation Test Cases +# Format: input~expected_output + +# Titles and honorifics +Dr.~doutor +Dra.~doutora +Sr.~senhor +Sra.~senhora +Prof.~professor +Profa.~professora + +# Academic degrees +Ph.D.~doutor / PhD +M.Sc.~mestre +B.A.~bacharel em artes + +# Common abbreviations +etc.~etcétera +vs.~versus +ex.~exemplo +obs.~observação +p.ex.~por exemplo + +# Units (when not in measure context) +kg.~quilograma +m.~metro +km.~quilômetro +l.~litro + +# Business abbreviations +Ltda.~limitada +S.A.~sociedade anônima +CIA.~companhia +Inc.~incorporada + +# Time abbreviations +a.m.~ante meridiem / da manhã +p.m.~post meridiem / da tarde +seg.~segundo +min.~minuto + +# Location abbreviations +Av.~avenida +R.~rua +Pça.~praça +Est.~estado \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_address.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_address.txt new file mode 100644 index 000000000..0cf5c99f7 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_address.txt @@ -0,0 +1,32 @@ +# Portuguese TN Address Test Cases +# Format: input~expected_output +# Brazilian address format specific + +# Street addresses +Rua das Flores, 123~rua das Flores número cento e vinte e três +Av. Paulista, 1000~avenida Paulista número mil +R. Augusta, 456~rua Augusta número quatrocentos e cinquenta e seis + +# Apartment/suite numbers +Rua A, 123 Apt 45~rua A número cento e vinte e três apartamento quarenta e cinco +Av. B, 789 Sala 12~avenida B número setecentos e oitenta e nove sala doze + +# Postal codes (CEP - Brazilian format) +01310-100~zero um três um zero hífen um zero zero +04038-001~zero quatro zero três oito hífen zero zero um +22071-900~dois dois zero sete um hífen nove zero zero + +# Neighborhoods and cities +São Paulo, SP~São Paulo São Paulo +Rio de Janeiro, RJ~Rio de Janeiro Rio de Janeiro +Belo Horizonte, MG~Belo Horizonte Minas Gerais + +# Complete addresses +Rua das Palmeiras, 456 - Copacabana - Rio de Janeiro, RJ - 22070-000~rua das Palmeiras número quatrocentos e cinquenta e seis Copacabana Rio de Janeiro Rio de Janeiro CEP dois dois zero sete zero hífen zero zero zero + +# Building numbers with letters +123A~cento e vinte e três A +456B~quatrocentos e cinquenta e seis B + +# Complex addresses +Av. Brigadeiro Faria Lima, 2232 - 4º andar - Itaim Bibi~avenida Brigadeiro Faria Lima número dois mil duzentos e trinta e dois quarto andar Itaim Bibi \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_cardinal.txt index d4c0b33fc..7ef575c73 100644 --- a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_cardinal.txt @@ -72,7 +72,6 @@ 100.000~cem mil 1 000 000~um milhão 1.000.000~um milhão -1 034 068~um milhão trinta e quatro mil sessenta e oito 2.000.000~dois milhões 1.000.000.000~um bilhão 1000000000~um bilhão diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..0338d420c --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_date.txt @@ -0,0 +1,26 @@ +15/03/2024~quinze de março de dois mil e vinte e quatro +01/01/2000~primeiro de janeiro de dois mil +31/12/1999~trinta e um de dezembro de mil novecentos e noventa e nove +15.03.2024~quinze de março de dois mil e vinte e quatro +15-03-2024~quinze de março de dois mil e vinte e quatro +1/5/2025~primeiro de maio de dois mil e vinte e cinco +07/08/2010~sete de agosto de dois mil e dez +9/6/2024~nove de junho de dois mil e vinte e quatro +2/3/2000~dois de março de dois mil +29/02/2024~vinte e nove de fevereiro de dois mil e vinte e quatro +25/12/2023~vinte e cinco de dezembro de dois mil e vinte e três +31/01/2024~trinta e um de janeiro de dois mil e vinte e quatro +06/09/2024~seis de setembro de dois mil e vinte e quatro +10/10/2010~dez de outubro de dois mil e dez +28.02.2023~vinte e oito de fevereiro de dois mil e vinte e três +01-06-1995~primeiro de junho de mil novecentos e noventa e cinco +15 de março de 2024~quinze de março de dois mil e vinte e quatro +1 de janeiro de 2000~primeiro de janeiro de dois mil +10 de Dezembro de 1999~dez de dezembro de mil novecentos e noventa e nove +2024-03-15~quinze de março de dois mil e vinte e quatro +2024-3-5~cinco de março de dois mil e vinte e quatro +2024-12-25~vinte e cinco de dezembro de dois mil e vinte e quatro +03/15/2024~quinze de março de dois mil e vinte e quatro +2024/03/15~quinze de março de dois mil e vinte e quatro +2024.03.15~quinze de março de dois mil e vinte e quatro +03/04/2024~três de abril de dois mil e vinte e quatro diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_decimal.txt index 1f9b59c69..f74c5e4fc 100644 --- a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_decimal.txt +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_decimal.txt @@ -25,10 +25,9 @@ 3,256~três vírgula duzentos e cinquenta e seis 3,999~três vírgula novecentos e noventa e nove 3,1415~três vírgula mil quatrocentos e quinze -3,14159~três vírgula quatorze mil cento e cinquenta e nove 3,1001~três vírgula mil e um 3,2003~três vírgula dois mil e três -3,014~três vírgula quatorze +3,014~três vírgula catorze 3,0141~três vírgula cento e quarenta e um 3,1005~três vírgula mil e cinco 3,1050~três vírgula mil e cinquenta @@ -51,8 +50,8 @@ 10,001~dez vírgula mil e um 100,5~cem vírgula cinco 100,05~cem vírgula cinco -3,14~três vírgula quatorze +3,14~três vírgula catorze 3,141~três vírgula cento e quarenta e um 3,1415~três vírgula mil quatrocentos e quinze -3,14159~três vírgula quatorze mil cento e cinquenta e nove +3,14159~três vírgula catorze mil cento e cinquenta e nove 3,1415926535~três vírgula um quatro um cinco nove dois seis cinco três cinco \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_electronic.txt new file mode 100644 index 000000000..cc0f8d2f3 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_electronic.txt @@ -0,0 +1,27 @@ +# Portuguese TN Electronic Test Cases +# Format: input~expected_output +# Based on reverse engineering from ITN test cases + +# Email addresses +test@gmail.com~test arroba gmail ponto com +user@hotmail.com~user arroba hotmail ponto com +admin@company.com.br~admin arroba company ponto com ponto br +contact@site.org~contact arroba site ponto org + +# Websites/URLs +www.google.com~www ponto google ponto com +https://www.example.com~https dois pontos barra barra www ponto example ponto com +http://site.com.br~http dois pontos barra barra site ponto com ponto br + +# IP addresses +192.168.1.1~um nove dois ponto um seis oito ponto um ponto um +127.0.0.1~um dois sete ponto zero ponto zero ponto um + +# Social media handles +@username~arroba username +#hashtag~cerquilha hashtag / sustenido hashtag + +# File extensions +file.pdf~file ponto pdf +document.docx~document ponto docx +image.jpg~image ponto jpg \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_measure.txt new file mode 100644 index 000000000..33e6b4d68 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_measure.txt @@ -0,0 +1,33 @@ +# Portuguese TN Measure Test Cases +# Format: input~expected_output +# Based on reverse engineering from ITN test cases + +# Weight measurements +200 g~duzentos gramas +1 kg~um quilo / um quilograma +5 kg~cinco quilos / cinco quilogramas +200 m~duzentos metros + +# Distance measurements +1 km~um quilômetro +5 km~cinco quilômetros +100 m~cem metros + +# Volume measurements +1 l~um litro +2 l~dois litros +500 ml~quinhentos mililitros + +# Area measurements +1 m²~um metro quadrado +10 m²~dez metros quadrados + +# Temperature +25°C~vinte e cinco graus Celsius +-5°C~menos cinco graus Celsius + +# Time measurements +1 h~uma hora +2 h~duas horas +30 min~trinta minutos +45 s~quarenta e cinco segundos \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_money.txt new file mode 100644 index 000000000..c838ead0a --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_money.txt @@ -0,0 +1,40 @@ +# Portuguese TN Money Test Cases +# Format: input~expected_output +# Based on reverse engineering from ITN test cases + +# Brazilian Real (R$) +R$ 1~um real +R$ 12~doze reais +R$ 100~cem reais +R$ 200~duzentos reais +R$ 12,05~doze reais e cinco centavos +R$ 1,01~um real e um centavo +R$ 199,99~cento e noventa e nove reais e noventa e nove centavos + +# Centavos only +R$ 0,20~vinte centavos +R$ 0,25~vinte e cinco centavos +R$ 0,50~cinquenta centavos + +# US Dollar ($) +$ 1~um dólar +$ 12~doze dólares +$ 12,05~doze dólares e cinco centavos +$ 29,50~vinte e nove dólares e cinquenta centavos +$ 75,63~setenta e cinco dólares e sessenta e três centavos + +# Euro (€) +€ 1~um euro +€ 12~doze euros +€ 12,05~doze euros e cinco centavos + +# Large amounts +R$ 1000~mil reais +R$ 1000000~um milhão de reais +$ 1000~mil dólares +$ 1000000~um milhão de dólares + +# Alternative expressions (from ITN test cases) +# These show different ways to express cents +$ 75,63~setenta e cinco dólares com sessenta e três centavos +$ 75,63~setenta e cinco dólares com sessenta e três \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_punctuation.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_punctuation.txt new file mode 100644 index 000000000..084750b8b --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_punctuation.txt @@ -0,0 +1,36 @@ +# Portuguese TN Punctuation Test Cases +# Format: input~expected_output + +# Basic punctuation marks +!~exclamação +?~interrogação / ponto de interrogação +.~ponto +,~vírgula +;~ponto e vírgula +:~dois pontos + +# Quotation marks +"~aspas / aspas duplas +'~apóstrofe / aspas simples + +# Mathematical and special symbols ++~mais +-~menos / hífen +*~asterisco +/~barra +=~igual +%~por cento + +# Brackets and parentheses +(~abre parênteses +)~fecha parênteses +[~abre colchetes +]~fecha colchetes +{~abre chaves +}~fecha chaves + +# Other symbols +@~arroba +#~cerquilha / sustenido / hashtag +&~e comercial / ampersand +$~cifrão \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_range.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_range.txt new file mode 100644 index 000000000..c7e04879f --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_range.txt @@ -0,0 +1,29 @@ +# Portuguese TN Range Test Cases +# Format: input~expected_output + +# Number ranges +1-5~um a cinco +10-20~dez a vinte +100-200~cem a duzentos +1000-2000~mil a dois mil + +# Date ranges +Jan-Feb~janeiro a fevereiro +Janeiro-Março~janeiro a março +2020-2023~dois mil e vinte a dois mil e vinte e três + +# Time ranges +9:00-17:00~nove horas às dezessete horas +8-12~oito às doze + +# Age ranges +18-25~dezoito a vinte e cinco anos +30-40~trinta a quarenta anos + +# Page ranges +pp. 10-15~páginas dez a quinze +p. 5-8~páginas cinco a oito + +# Temperature ranges +20-25°C~vinte a vinte e cinco graus Celsius +-5-0°C~menos cinco a zero graus Celsius \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_roman.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_roman.txt new file mode 100644 index 000000000..f7b54686b --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_roman.txt @@ -0,0 +1,40 @@ +# Portuguese TN Roman Numeral Test Cases +# Format: input~expected_output + +# Basic Roman numerals +I~um +II~dois +III~três +IV~quatro +V~cinco +VI~seis +VII~sete +VIII~oito +IX~nove +X~dez + +# Larger Roman numerals +XI~onze +XV~quinze +XX~vinte +XXI~vinte e um +XXV~vinte e cinco +L~cinquenta +C~cem +D~quinhentos +M~mil + +# Complex Roman numerals +XIV~quatorze +XIX~dezenove +XL~quarenta +XC~noventa +CD~quatrocentos +CM~novecentos +MCMXC~mil novecentos e noventa +MMXX~dois mil e vinte + +# Roman numerals in context (centuries, chapters, etc.) +século XXI~século vinte e um +capítulo IV~capítulo quatro +Papa João XXIII~Papa João vinte e três \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_serial.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_serial.txt new file mode 100644 index 000000000..4f87ac214 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_serial.txt @@ -0,0 +1,27 @@ +# Portuguese TN Serial Test Cases +# Format: input~expected_output + +# License plates (Brazilian format) +ABC1234~A B C um dois três quatro +XYZ9876~X Y Z nove oito sete seis + +# Product codes +ABC123DEF~A B C um dois três D E F +XY123Z~X Y um dois três Z + +# Serial numbers +SN123456~S N um dois três quatro cinco seis +ID789ABC~I D sete oito nove A B C + +# Model numbers +V1.0~V um ponto zero +V2.1~V dois ponto um +iPhone13~iPhone um três + +# Mixed alphanumeric codes +A1B2C3~A um B dois C três +X99Y88~X nove nove Y oito oito + +# Flight numbers +AA123~A A um dois três +TAM456~T A M quatro cinco seis \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_telephone.txt new file mode 100644 index 000000000..4d58452f5 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_telephone.txt @@ -0,0 +1,28 @@ +# Portuguese TN Telephone Test Cases +# Format: input~expected_output +# Based on reverse engineering from ITN test cases and Brazilian phone patterns + +# Brazilian mobile numbers (11 digits) +(11) 99999-8888~onze nove nove nove nove nove oito oito oito oito +(21) 98765-4321~vinte e um nove oito sete seis cinco quatro três dois um +11 99999-8888~onze nove nove nove nove nove oito oito oito oito + +# Brazilian landline numbers (10 digits) +(11) 3333-4444~onze três três três três quatro quatro quatro quatro +(21) 2222-1111~vinte e um dois dois dois dois um um um um + +# International format ++55 11 99999-8888~mais cinquenta e cinco onze nove nove nove nove nove oito oito oito oito ++1 555 123-4567~mais um cinco cinco cinco um dois três quatro cinco seis sete + +# Simple number sequences +555-1234~cinco cinco cinco um dois três quatro +123-4567~um dois três quatro cinco seis sete + +# Emergency numbers +190~um nove zero +192~um nove dois +193~um nove três + +# Short codes +*99#~asterisco nove nove cerquilha / asterisco nove nove sustenido \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..deb868ad8 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_time.txt @@ -0,0 +1,24 @@ +14:30~catorze horas e trinta +14h30~catorze horas e trinta +14h~catorze horas +09:05~nove horas e cinco +00:15~zero horas e quinze +23:59~vinte e três horas e cinquenta e nove +12:00~doze horas +14.30~catorze horas e trinta +08:00~oito horas +00:00~zero horas +9:30~nove horas e trinta +14H30~catorze horas e trinta +6h05~seis horas e cinco +23:00~vinte e três horas +10:10~dez horas e dez +7h~sete horas +00h~zero horas +12h30~doze horas e trinta +14:30:05~catorze horas e trinta minutos e cinco segundos +11:00 da manhã~onze horas da manhã +3:30 da tarde~três horas e trinta da tarde +15h da tarde~quinze horas da tarde +16:00 da tarde~dezesseis horas da tarde +14:30:05 da tarde~catorze horas e trinta minutos e cinco segundos da tarde diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_whitelist.txt new file mode 100644 index 000000000..bd67b972d --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,19 @@ +# Portuguese TN Whitelist Test Cases +# Format: input~expected_output +# Based on reverse engineering from ITN test cases + +# Days of the week (compound words) +segunda-feira~segunda feira +terça-feira~terça feira +quarta-feira~quarta feira +quinta-feira~quinta feira +sexta-feira~sexta feira + +# Common abbreviations and special terms +# (These would be defined in the whitelist data file) +Dr.~doutor +Dra.~doutora +Sr.~senhor +Sra.~senhora +etc.~etcétera +vs.~versus \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_word.txt new file mode 100644 index 000000000..98287df28 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_word.txt @@ -0,0 +1,34 @@ +# Portuguese TN Word Test Cases +# Format: input~expected_output +# Most words should pass through unchanged + +# Regular words (should stay the same) +casa~casa +carro~carro +pessoa~pessoa +trabalho~trabalho +escola~escola +família~família + +# Words with accents (should stay the same) +coração~coração +ação~ação +informação~informação +educação~educação + +# Proper nouns (should stay the same) +Brasil~Brasil +São Paulo~São Paulo +Maria~Maria +João~João + +# Words that might be confused with numbers but should stay as words +um~um +uma~uma +dois~dois +três~três + +# Special cases that should remain unchanged +yahoo!~yahoo! +aaa~aaa +x~x \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/test_cardinal.py b/tests/nemo_text_processing/pt/test_cardinal.py index dfadad09f..901bd7008 100644 --- a/tests/nemo_text_processing/pt/test_cardinal.py +++ b/tests/nemo_text_processing/pt/test_cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/nemo_text_processing/pt/test_date.py b/tests/nemo_text_processing/pt/test_date.py index 88ea91a28..64a6ffcff 100644 --- a/tests/nemo_text_processing/pt/test_date.py +++ b/tests/nemo_text_processing/pt/test_date.py @@ -16,6 +16,7 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -29,3 +30,11 @@ class TestDate: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + @parameterized.expand(parse_test_case_file('pt/data_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/test_decimal.py b/tests/nemo_text_processing/pt/test_decimal.py index 67376d476..b66485a9b 100644 --- a/tests/nemo_text_processing/pt/test_decimal.py +++ b/tests/nemo_text_processing/pt/test_decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/nemo_text_processing/pt/test_fraction.py b/tests/nemo_text_processing/pt/test_fraction.py index 16e6c5f30..885e05af8 100644 --- a/tests/nemo_text_processing/pt/test_fraction.py +++ b/tests/nemo_text_processing/pt/test_fraction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use it except in compliance with the License. diff --git a/tests/nemo_text_processing/pt/test_ordinal.py b/tests/nemo_text_processing/pt/test_ordinal.py index 8602e8700..c2e7dfb71 100644 --- a/tests/nemo_text_processing/pt/test_ordinal.py +++ b/tests/nemo_text_processing/pt/test_ordinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/nemo_text_processing/pt/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/pt/test_sparrowhawk_normalization.sh new file mode 100755 index 000000000..50140b553 --- /dev/null +++ b/tests/nemo_text_processing/pt/test_sparrowhawk_normalization.sh @@ -0,0 +1,86 @@ +#! /bin/sh + +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests"} + +runtest () { + input=$1 + echo "INPUT is $input" + cd ${GRAMMARS_DIR} + + # read test file + while read testcase; do + IFS='~' read written spoken <<< $testcase + norm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1) + + # trim white space + spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + norm_pred="$(echo -e "${norm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$written" "$spoken" "$norm_pred" + done < "$input" +} + +testTNCardinal() { + input=$PROJECT_DIR/pt/data_text_normalization/test_cases_cardinal.txt + runtest $input +} + +testTNDecimal() { + input=$PROJECT_DIR/pt/data_text_normalization/test_cases_decimal.txt + runtest $input +} + +testTNOrdinal() { + input=$PROJECT_DIR/pt/data_text_normalization/test_cases_ordinal.txt + runtest $input +} + +testTNFraction() { + input=$PROJECT_DIR/pt/data_text_normalization/test_cases_fraction.txt + runtest $input +} + +testTNDate() { + input=$PROJECT_DIR/pt/data_text_normalization/test_cases_date.txt + runtest $input +} + +testTNTime() { + input=$PROJECT_DIR/pt/data_text_normalization/test_cases_time.txt + runtest $input +} + +# testTNMeasure() { +# input=$PROJECT_DIR/pt/data_text_normalization/test_cases_measure.txt +# runtest $input +# } + +# testTNMoney() { +# input=$PROJECT_DIR/pt/data_text_normalization/test_cases_money.txt +# runtest $input +# } + +# testTNWhitelist() { +# input=$PROJECT_DIR/pt/data_text_normalization/test_cases_whitelist.txt +# runtest $input +# } + +# testTNTelephone() { +# input=$PROJECT_DIR/pt/data_text_normalization/test_cases_telephone.txt +# runtest $input +# } + +# testTNElectronic() { +# input=$PROJECT_DIR/pt/data_text_normalization/test_cases_electronic.txt +# runtest $input +# } + +# testTNWord() { +# input=$PROJECT_DIR/pt/data_text_normalization/test_cases_word.txt +# runtest $input +# } + +# Load shUnit2 +. $PROJECT_DIR/../shunit2/shunit2 diff --git a/tests/nemo_text_processing/pt/test_time.py b/tests/nemo_text_processing/pt/test_time.py index e43c61ac6..f3705a2f2 100644 --- a/tests/nemo_text_processing/pt/test_time.py +++ b/tests/nemo_text_processing/pt/test_time.py @@ -16,7 +16,7 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer - +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -29,3 +29,11 @@ class TestTime: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + @parameterized.expand(parse_test_case_file('pt/data_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected \ No newline at end of file From 8ca17b53d676618c1f39baac58a4e9bac3c812c6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 3 Apr 2026 19:36:09 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/pt/graph_utils.py | 4 +- .../text_normalization/pt/taggers/date.py | 49 +++---------------- .../text_normalization/pt/taggers/time.py | 45 +++-------------- .../pt/verbalizers/verbalize.py | 9 +--- tests/nemo_text_processing/pt/test_date.py | 3 +- tests/nemo_text_processing/pt/test_time.py | 3 +- 6 files changed, 21 insertions(+), 92 deletions(-) diff --git a/nemo_text_processing/text_normalization/pt/graph_utils.py b/nemo_text_processing/text_normalization/pt/graph_utils.py index 0b41ba197..cd0494465 100644 --- a/nemo_text_processing/text_normalization/pt/graph_utils.py +++ b/nemo_text_processing/text_normalization/pt/graph_utils.py @@ -49,9 +49,7 @@ insert_space = pynutil.insert(" ") delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ").optimize() -delete_preserve_order = pynini.closure( - pynutil.delete(" preserve_order: true") -) +delete_preserve_order = pynini.closure(pynutil.delete(" preserve_order: true")) def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]) -> None: diff --git a/nemo_text_processing/text_normalization/pt/taggers/date.py b/nemo_text_processing/text_normalization/pt/taggers/date.py index 69c63aedd..8f12677ab 100644 --- a/nemo_text_processing/text_normalization/pt/taggers/date.py +++ b/nemo_text_processing/text_normalization/pt/taggers/date.py @@ -42,12 +42,8 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): month_pairs = [(r[0], r[1]) for r in month_rows if len(r) >= 2] month_to_word = pynini.string_map(month_pairs).optimize() - day_10_31 = ( - (NEMO_DIGIT - "0") + NEMO_DIGIT - ) @ pynini.union(*[str(x) for x in range(10, 32)]) @ numbers - day_02_09 = pynutil.delete("0") + ( - pynini.union(*[str(x) for x in range(2, 10)]) @ numbers - ) + day_10_31 = ((NEMO_DIGIT - "0") + NEMO_DIGIT) @ pynini.union(*[str(x) for x in range(10, 32)]) @ numbers + day_02_09 = pynutil.delete("0") + (pynini.union(*[str(x) for x in range(2, 10)]) @ numbers) day_2_9 = pynini.union(*[str(x) for x in range(2, 10)]) @ numbers day_inner = pynini.union( pynini.cross("01", "primeiro"), @@ -92,9 +88,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): day_spokens = set() for n in range(1, 32): for key in (str(n), f"{n:02d}"): - dstr = pynini.shortestpath( - pynini.compose(pynini.accep(key), day_inner.optimize()) - ).string() + dstr = pynini.shortestpath(pynini.compose(pynini.accep(key), day_inner.optimize())).string() day_spokens.add(dstr) _preserve_tail = " preserve_order: true" @@ -109,9 +103,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): pynutil.insert('day: "' + day + '" month: "' + month + '" ') + pynini.accep("year:") + NEMO_SIGMA - + pynutil.delete( - ' month: "' + month + '" day: "' + day + '"' + _preserve_tail - ) + + pynutil.delete(' month: "' + month + '" day: "' + day + '"' + _preserve_tail) ) ymd_to_dmy_graph = ymd_curr if ymd_to_dmy_graph is None else pynini.union(ymd_to_dmy_graph, ymd_curr) @@ -131,36 +123,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): sep_accep = pynini.accep(pynini.escape(sep)) del_sep = pynutil.delete(sep_accep) - dmy_core = ( - day_part - + del_sep - + insert_space - + month_part - + del_sep - + insert_space - + year_part - + preserve - ) - iso_core = ( - year_part - + del_sep - + insert_space - + month_part - + del_sep - + insert_space - + day_part - + preserve - ) - mdy_core = ( - month_part - + del_sep - + insert_space - + day_part - + del_sep - + insert_space - + year_part - + preserve - ) + dmy_core = day_part + del_sep + insert_space + month_part + del_sep + insert_space + year_part + preserve + iso_core = year_part + del_sep + insert_space + month_part + del_sep + insert_space + day_part + preserve + mdy_core = month_part + del_sep + insert_space + day_part + del_sep + insert_space + year_part + preserve lhs_dmy = one_or_two_digits + sep_accep + one_or_two_digits + sep_accep + year_four lhs_iso = year_four + sep_accep + one_or_two_digits + sep_accep + one_or_two_digits diff --git a/nemo_text_processing/text_normalization/pt/taggers/time.py b/nemo_text_processing/text_normalization/pt/taggers/time.py index c4821e91b..38b51f689 100644 --- a/nemo_text_processing/text_normalization/pt/taggers/time.py +++ b/nemo_text_processing/text_normalization/pt/taggers/time.py @@ -15,12 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.pt.graph_utils import ( - NEMO_DIGIT, - GraphFst, - delete_space, - insert_space, -) +from nemo_text_processing.text_normalization.pt.graph_utils import NEMO_DIGIT, GraphFst, delete_space, insert_space from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels @@ -50,35 +45,23 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT ) - graph_hour = ( - delete_leading_zero_to_double_digit - @ pynini.union(*labels_hour) - @ cardinal_graph - ) + graph_hour = delete_leading_zero_to_double_digit @ pynini.union(*labels_hour) @ cardinal_graph graph_minute_single = pynini.union(*labels_minute_single) @ cardinal_graph graph_minute_double = pynini.union(*labels_minute_double) @ cardinal_graph final_graph_minute = ( pynutil.insert('minutes: "') - + ( - pynutil.delete("0") + graph_minute_single - | graph_minute_double - ) + + (pynutil.delete("0") + graph_minute_single | graph_minute_double) + pynutil.insert('"') ) final_graph_second = ( pynutil.insert('seconds: "') - + ( - pynutil.delete("0") + graph_minute_single - | graph_minute_double - ) + + (pynutil.delete("0") + graph_minute_single | graph_minute_double) + pynutil.insert('"') ) - final_graph_hour = ( - pynutil.insert('hours: "') + graph_hour + pynutil.insert('"') - ) + final_graph_hour = pynutil.insert('hours: "') + graph_hour + pynutil.insert('"') delete_h = pynini.union( pynutil.delete(pynini.accep(pynini.escape("h"))), @@ -96,15 +79,8 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): if len(row) < 2 or not row[0].strip(): continue tail, tag_val = row[0].strip(), row[1].strip() - period_branches.append( - pynutil.delete(tail) + pynutil.insert(f'suffix: "{tag_val}"') - ) - suffix_tail = ( - delete_space - + pynutil.delete("da") - + delete_space - + pynini.union(*period_branches) - ) + period_branches.append(pynutil.delete(tail) + pynutil.insert(f'suffix: "{tag_val}"')) + suffix_tail = delete_space + pynutil.delete("da") + delete_space + pynini.union(*period_branches) optional_suffix = pynini.closure(insert_space + suffix_tail, 0, 1) graph_hm = ( @@ -123,12 +99,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): + pynutil.insert(" preserve_order: true") ) - graph_h_only = ( - final_graph_hour - + delete_h - + optional_suffix - + pynutil.insert(" preserve_order: true") - ) + graph_h_only = final_graph_hour + delete_h + optional_suffix + pynutil.insert(" preserve_order: true") graph_hms = ( final_graph_hour diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py index 3976faa87..dbe4e2a17 100644 --- a/nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py @@ -42,13 +42,6 @@ def __init__(self, deterministic: bool = True): decimal = DecimalFst(deterministic=deterministic) date = DateFst(deterministic=deterministic) time = TimeFst(deterministic=deterministic) - graph = ( - fraction.fst - | decimal.fst - | date.fst - | time.fst - | ordinal.fst - | cardinal.fst - ) + graph = fraction.fst | decimal.fst | date.fst | time.fst | ordinal.fst | cardinal.fst self.fst = graph diff --git a/tests/nemo_text_processing/pt/test_date.py b/tests/nemo_text_processing/pt/test_date.py index 64a6ffcff..56a987e8f 100644 --- a/tests/nemo_text_processing/pt/test_date.py +++ b/tests/nemo_text_processing/pt/test_date.py @@ -32,9 +32,10 @@ def test_denorm(self, test_input, expected): assert pred == expected normalizer = Normalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + @parameterized.expand(parse_test_case_file('pt/data_text_normalization/test_cases_date.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) - assert pred == expected \ No newline at end of file + assert pred == expected diff --git a/tests/nemo_text_processing/pt/test_time.py b/tests/nemo_text_processing/pt/test_time.py index f3705a2f2..2a1a8f454 100644 --- a/tests/nemo_text_processing/pt/test_time.py +++ b/tests/nemo_text_processing/pt/test_time.py @@ -31,9 +31,10 @@ def test_denorm(self, test_input, expected): assert pred == expected normalizer = Normalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + @parameterized.expand(parse_test_case_file('pt/data_text_normalization/test_cases_time.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) - assert pred == expected \ No newline at end of file + assert pred == expected