diff --git a/Jenkinsfile b/Jenkinsfile index 33b361cf5..3d7a538ed 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -28,7 +28,7 @@ pipeline { MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-04-26-0' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-17-26-0' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/serial/chars.tsv b/nemo_text_processing/text_normalization/hi/data/serial/chars.tsv new file mode 100644 index 000000000..d7c9c39e8 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/serial/chars.tsv @@ -0,0 +1,68 @@ +अ +आ +इ +ई +उ +ऊ +ऋ +ए +ऐ +ओ +औ +ऑ +ा +ि +ी +ु +ू +ृ +े +ै +ो +ौ +ॉ +ं +ः +ँ +क +ख +ग +घ +ङ +च +छ +ज +झ +ञ +ट +ठ +ड +ढ +ण +त +थ +द +ध +न +प +फ +ब +भ +म +य +र +ल +व +श +ष +स +ह +क़ +ख़ +ग़ +ज़ +ड़ +ढ़ +फ़ +य़ +् \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/serial/power_special.tsv b/nemo_text_processing/text_normalization/hi/data/serial/power_special.tsv new file mode 100644 index 000000000..64583f947 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/serial/power_special.tsv @@ -0,0 +1,4 @@ +^2 स्क्वेर्ड +^२ स्क्वेर्ड +^3 क्यूब +^३ क्यूब \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/serial/special_symbols.tsv b/nemo_text_processing/text_normalization/hi/data/serial/special_symbols.tsv new file mode 100644 index 000000000..c96a15bd6 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/serial/special_symbols.tsv @@ -0,0 +1,4 @@ +# हैशटैग +% प्रतिशत +& एंड +@ एट \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/serial.py 
b/nemo_text_processing/text_normalization/hi/taggers/serial.py
new file mode 100644
index 000000000..d7433e583
--- /dev/null
+++ b/nemo_text_processing/text_normalization/hi/taggers/serial.py
@@ -0,0 +1,131 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.hi.graph_utils import (
+    NEMO_ALPHA,
+    NEMO_DIGIT,
+    NEMO_NOT_SPACE,
+    NEMO_SIGMA,
+    GraphFst,
+    convert_space,
+)
+from nemo_text_processing.text_normalization.hi.utils import get_abs_path
+
+
+class SerialFst(GraphFst):
+    """
+    Classifier for Hindi serial tokens: tokens that mix words, numbers and
+    symbols. Digits may be ASCII (0-9) or Devanagari (०-९).
+
+    Delimited chains (segments joined by "-" or "/"):
+        कोविड-19 -> tokens { name: "कोविड-उन्नीस" }
+        1-800-555 -> tokens { name: "एक-आठ सौ-पाँच सौ पचपन" }
+    Letters glued to digits (a space is inserted at the boundary):
+        5जी -> tokens { name: "पाँच जी" }
+        ३जी -> tokens { name: "तीन जी" }
+    Powers:
+        2^2 -> tokens { name: "दो स्क्वेर्ड" }
+        2^4 -> tokens { name: "दो टु द पावर चार" }
+
+    Tokens made only of Latin letters and digits (A12, B-60) are rejected on
+    purpose so that they fall through to the electronic classifier.
+
+    Args:
+        cardinal: cardinal grammar; its digit, zero, teens_and_ties and
+            graph_hundreds graphs are used to read the numeric segments.
+        deterministic: passed through to GraphFst.
+    """
+
+    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
+        super().__init__(name="serial", kind="classify", deterministic=deterministic)
+
+        def tsv(relative_path):
+            # Compile one of the bundled TSV data files into an FST.
+            return pynini.string_file(get_abs_path(relative_path))
+
+        def input_side(fst):
+            # Keep only the input strings of a mapping, optimized.
+            return pynini.project(fst, "input").optimize()
+
+        # Character classes.
+        devanagari_digit = input_side(pynini.union(tsv("data/numbers/digit.tsv"), tsv("data/numbers/zero.tsv")))
+        any_digit = pynini.union(NEMO_DIGIT, devanagari_digit).optimize()
+        devanagari_char = tsv("data/serial/chars.tsv").optimize()
+        any_letter = pynini.union(NEMO_ALPHA, devanagari_char).optimize()
+
+        # Segment kinds: a number, a symbol, letters.tsv letters joined by inserted spaces,
+        # or a run of two or more chars.tsv characters.
+        number = pynini.union(
+            cardinal.digit, cardinal.zero, cardinal.teens_and_ties, cardinal.graph_hundreds
+        ).optimize()
+        symbol = tsv("data/serial/special_symbols.tsv").optimize()
+        latin_letter = tsv("data/address/letters.tsv")
+        spelled_latin = (latin_letter + pynini.closure(pynutil.insert(" ") + latin_letter)).optimize()
+        devanagari_word = pynini.closure(devanagari_char, 2).optimize()
+        word = pynini.union(spelled_latin, devanagari_word).optimize()
+        segment = pynini.union(word, number, symbol).optimize()
+
+        # A chain is two or more segments, each pair joined by "-", "/" or a single space.
+        separator = pynini.union(pynini.accep("-"), pynini.accep("/"), pynini.accep(" ")).optimize()
+        chain = (segment + pynini.closure(separator + segment, 1)).optimize()
+
+        # Glued input such as 5जी first gets a space at every letter/digit boundary and is
+        # then read as a chain; the inserted space plays the role of the separator.
+        split_letter_digit = pynini.cdrewrite(pynutil.insert(" "), any_letter, any_digit, NEMO_SIGMA)
+        split_digit_letter = pynini.cdrewrite(pynutil.insert(" "), any_digit, any_letter, NEMO_SIGMA)
+        add_boundary_spaces = pynini.compose(split_letter_digit, split_digit_letter).optimize()
+        glued_chain = pynini.compose(add_boundary_spaces, chain).optimize()
+        candidates = pynini.union(chain, glued_chain).optimize()
+
+        # Powers: "^2"/"^3" have dedicated readings in power_special.tsv and get a lower weight (-1.0)
+        # than the generic "<base> टु द पावर <exponent>" reading (1.0).
+        special_power = pynutil.add_weight(tsv("data/serial/power_special.tsv"), -1.0).optimize()
+        generic_power = pynutil.add_weight(
+            pynutil.delete("^") + pynutil.insert(" टु द पावर ") + number, 1.0
+        ).optimize()
+        exponent = pynini.union(special_power, generic_power).optimize()
+        candidates = pynini.union(candidates, number + exponent).optimize()
+
+        # Only inputs made of at least two NEMO_NOT_SPACE symbols are considered.
+        candidates = pynini.compose(pynini.closure(NEMO_NOT_SPACE, 2), candidates).optimize()
+
+        # Whole-input patterns that are rejected so that other classifiers can take them.
+        # NOTE(review): the date pattern and the " " joiner need a literal space, but candidates were
+        # already limited to NEMO_NOT_SPACE symbols above - presumably unreachable; confirm.
+        digits = pynini.closure(any_digit, 1)
+        latin = pynini.closure(NEMO_ALPHA, 1)
+        optional_joiner = pynini.closure(pynini.union(pynini.accep("-"), pynini.accep(" ")), 0, 1)
+        ordinal_suffix = input_side(
+            pynini.union(tsv("data/ordinal/suffixes.tsv"), tsv("data/ordinal/suffixes_map.tsv"))
+        )
+        date_suffix = pynini.union(
+            input_side(tsv("data/date/year_suffix.tsv")), input_side(tsv("data/date/suffixes.tsv"))
+        )
+        rejected = pynini.union(
+            latin + pynini.accep("/") + latin,  # word/word
+            digits + pynini.union(pynini.accep("x"), pynini.accep("X")) + digits,  # dimensions, e.g. 3x4
+            latin + optional_joiner + digits,  # Latin letters then digits, e.g. A12, B-60
+            digits + optional_joiner + latin,  # digits then Latin letters
+            digits + ordinal_suffix,  # digits followed by an ordinal suffix
+            digits + pynini.closure(pynini.accep("-") + digits, 0) + pynini.accep(" ") + date_suffix,  # dates
+        )
+        allowed_input = pynini.difference(NEMO_SIGMA, rejected).optimize()
+        candidates = pynini.compose(allowed_input, candidates).optimize()
+
+        self.graph = candidates.optimize()
+        tagged = pynutil.insert('name: "') + convert_space(self.graph).optimize() + pynutil.insert('"')
+        self.fst = tagged.optimize()
diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index 88cb04727..75663ca24 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -35,6 +35,7 @@ from nemo_text_processing.text_normalization.hi.taggers.money import MoneyFst from nemo_text_processing.text_normalization.hi.taggers.ordinal import OrdinalFst from nemo_text_processing.text_normalization.hi.taggers.punctuation import PunctuationFst +from nemo_text_processing.text_normalization.hi.taggers.serial import SerialFst from nemo_text_processing.text_normalization.hi.taggers.telephone import TelephoneFst from nemo_text_processing.text_normalization.hi.taggers.time import TimeFst from nemo_text_processing.text_normalization.hi.taggers.whitelist import WhiteListFst @@ -111,12 +112,18 @@ def __init__( punctuation = PunctuationFst(deterministic=deterministic) punct_graph = punctuation.fst + word = WordFst(punctuation=punctuation, deterministic=deterministic) + word_graph = word.fst + telephone = TelephoneFst() telephone_graph = telephone.fst electronic = ElectronicFst(deterministic=deterministic) electronic_graph = electronic.fst + serial = SerialFst(cardinal=cardinal, deterministic=deterministic) + serial_graph = serial.fst + classify = ( pynutil.add_weight(whitelist_graph, 1.01) | pynutil.add_weight(cardinal_graph, 1.1) @@ -129,10 +136,9 @@ def __init__( | pynutil.add_weight(telephone_graph, 1.1) | pynutil.add_weight(ordinal_graph, 1.1) | pynutil.add_weight(electronic_graph, 1.1) + | pynutil.add_weight(serial_graph, 1.11) ) - word_graph = WordFst(punctuation=punctuation, 
deterministic=deterministic).fst - punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }") punct = pynini.closure( pynini.union(
diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_serial.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_serial.txt
new file mode 100644
index 000000000..4c3880fb9
--- /dev/null
+++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_serial.txt
@@ -0,0 +1,21 @@
+कोविड-19~कोविड-उन्नीस
+कोविड-१९~कोविड-उन्नीस
+5जी~पाँच जी
+५जी~पाँच जी
+2^2~दो स्क्वेर्ड
+२^२~दो स्क्वेर्ड
+1-800-555~एक-आठ सौ-पाँच सौ पचपन
+3जी~तीन जी
+4जी~चार जी
+कोरोना-2~कोरोना-दो
+अग्नि-5~अग्नि-पाँच
+ओमिक्रॉन-2~ओमिक्रॉन-दो
+3^2~तीन स्क्वेर्ड
+2^3~दो क्यूब
+5^3~पाँच क्यूब
+४^५~चार टु द पावर पाँच
+99-1~निन्यानबे-एक
+10-20-30~दस-बीस-तीस
+1-800-999~एक-आठ सौ-नौ सौ निन्यानबे
+पृथ्वी-4~पृथ्वी-चार
+ब्रह्मोस-1~ब्रह्मोस-एक
\ No newline at end of file
diff --git a/tests/nemo_text_processing/hi/test_serial.py b/tests/nemo_text_processing/hi/test_serial.py
new file mode 100644
index 000000000..43da54b17
--- /dev/null
+++ b/tests/nemo_text_processing/hi/test_serial.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from parameterized import parameterized
+
+from nemo_text_processing.text_normalization.normalize import Normalizer
+
+from ..utils import CACHE_DIR, parse_test_case_file
+
+
+class TestSerial:
+    """Data-driven checks of Hindi serial normalization against test_cases_serial.txt."""
+
+    # Built once at class-definition time and shared by every parameterized case.
+    normalizer = Normalizer(
+        lang='hi',
+        input_case='cased',
+        cache_dir=CACHE_DIR,
+        overwrite_cache=False,
+        post_process=True,
+    )
+
+    @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_serial.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_norm(self, test_input, expected):
+        """Normalize one written input and compare it with the expected output."""
+        normalized = self.normalizer.normalize(test_input, verbose=False, punct_post_process=True)
+        assert normalized == expected
diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh index e8057a126..974dac331 100644 --- a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh @@ -52,10 +52,10 @@ testTNDecimal() { # runtest $input #} -#testTNSerial() { -# input=$PROJECT_DIR/hi/data_text_normalization/test_cases_serial.txt -# runtest $input -#} +testTNSerial() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_serial.txt + runtest $input +} #testTNRoman() { # input=$PROJECT_DIR/en/data_text_normalization/test_cases_roman.txt