From 60425ed252fc09b42f7d9e9ca1cdcdd14c90ebc5 Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Wed, 10 Jun 2026 11:45:45 +0800 Subject: [PATCH] feat: English ITN enhancements from NeMo reference Time: - Support numeric minutes with "past" (e.g. "ten past four" -> 04:10) - Add "till" as alias for "to" (e.g. "quarter till two" -> 01:45) - Add "o' clock" and "o clock" variants - Zero-pad hours in to_hour.tsv Telephone: - Add "triple X" support (e.g. "triple five" -> 555) Date: - Add BCE, CE and long-form year suffixes - Add H1/H2 financial half-year periods - Add century ("nineteen hundreds" -> 1900s) and millennium ranges - Add "X hundred" year form (e.g. "nineteen hundred" -> 1900) Measure: - Replace cdrewrite with finite string_map for pluralization (fixes OOM) - Add -ies/-es rules and irregular plurals (feet, inches, ounces) - Fix "per" unit priority to prefer direct TSV matches (e.g. mph) --- itn/english/data/time/to_hour.tsv | 18 +++++------ itn/english/rules/date.py | 36 +++++++++++++++++++--- itn/english/rules/measure.py | 42 ++++++++++++++++++-------- itn/english/rules/telephone.py | 11 +++++-- itn/english/rules/time.py | 18 ++++++----- itn/english/test/data/en_date.txt | 8 +++++ itn/english/test/data/en_measure.txt | 2 ++ itn/english/test/data/en_telephone.txt | 1 + itn/english/test/data/en_time.txt | 7 +++++ 9 files changed, 108 insertions(+), 35 deletions(-) diff --git a/itn/english/data/time/to_hour.tsv b/itn/english/data/time/to_hour.tsv index fba67f1b..cddd3d38 100644 --- a/itn/english/data/time/to_hour.tsv +++ b/itn/english/data/time/to_hour.tsv @@ -1,12 +1,12 @@ one 12 -two 1 -three 2 -four 3 -five 4 -six 5 -seven 6 -eight 7 -nine 8 -ten 9 +two 01 +three 02 +four 03 +five 04 +six 05 +seven 06 +eight 07 +nine 08 +ten 09 eleven 10 twelve 11 \ No newline at end of file diff --git a/itn/english/rules/date.py b/itn/english/rules/date.py index 4804e6e3..6018473e 100644 --- a/itn/english/rules/date.py +++ b/itn/english/rules/date.py @@ -72,8 +72,10 @@ def build_tagger(self): # Year as hundreds: "nineteen oh five" => 1905 year_hundreds = (teen | two_digit) + ds + oh_digit + # Year as "X hundred": "nineteen hundred" => 1900 + year_xx_hundred = (teen | two_digit) + ds + delete("hundred") + insert("00") - year_graph = year_two_parts | year_thousands | year_hundreds + year_graph = year_two_parts | year_thousands | year_hundreds | year_xx_hundred # Delete optional "and" within year delete_and = self.build_rule(delete("and "), " ", self.ALPHA) @@ -138,12 +140,38 @@ def build_tagger(self): + insert(' year: "') + year_graph + insert('"') + po ) - # BC/AD suffix - bc_ad = ds + (cross("b c", "BC") | cross("a d", "AD")) + # BC/AD/BCE/CE suffix + bc_ad = ds + ( + cross("b c e", "BCE") | cross("before common era", "BCE") + | cross("b c", "BC") + | cross("c e", "CE") | cross("common era", "CE") + | cross("a d", "AD") + ) year_graph_with_3digit = year_graph | year_three_digit graph_y_bc = insert('year: "') + year_graph_with_3digit + bc_ad + insert('"') + po - final_graph = graph_mdy | graph_md | graph_my | graph_dmy | graph_dm | graph_y | graph_decade | graph_quarter | graph_y_bc + # Half: "first half of twenty twenty two" => H1 2022 + half_num = cross("first", "1") | cross("second", "2") + graph_half = ( + insert('day: "H') + half_num + insert('"') + + ds + delete("half") + ds + delete("of") + ds + + insert(' year: "') + year_graph + insert('"') + po + ) + + # Century: "nineteen hundreds" => 1900s + graph_century = ( + insert('year: "') + (teen | two_digit) + ds + cross("hundreds", "00s") + insert('"') + po + ) + # Millennium: "two thousands" => 2000s + graph_millennium = ( + insert('year: "') + cross("two", "2") + ds + cross("thousands", "000s") + insert('"') + po + ) + + final_graph = ( + graph_mdy | graph_md | graph_my | graph_dmy | graph_dm | graph_y + | graph_decade | graph_quarter | graph_half | graph_y_bc + | graph_century | graph_millennium + ) self.tagger = self.add_tokens(final_graph) def build_verbalizer(self): diff --git a/itn/english/rules/measure.py b/itn/english/rules/measure.py index 909c7990..b910e4a6 100644 --- a/itn/english/rules/measure.py +++ b/itn/english/rules/measure.py @@ -14,7 +14,7 @@ import pynini from pynini import closure, cross, invert, string_file -from pynini.lib.pynutil import delete, insert +from pynini.lib.pynutil import add_weight, delete, insert from itn.english.rules.cardinal import Cardinal from itn.english.rules.decimal import Decimal @@ -35,24 +35,40 @@ def build_tagger(self): ds = delete(" ") # Load measurements: symbol\tname, invert to get name -> symbol - units_graph = invert( - string_file(get_abs_path("../itn/english/data/measurements.tsv")) - ) + tsv_path = get_abs_path("../itn/english/data/measurements.tsv") + units_graph = invert(string_file(tsv_path)) + + # Handle plurals: generate plural->symbol mappings from the singular TSV entries + # Uses finite string_map instead of cdrewrite to avoid slow runtime compose + singular_names = {} + with open(tsv_path, encoding="utf-8") as f: + for line in f: + parts = line.strip().split("\t", 1) + if len(parts) == 2: + singular_names.setdefault(parts[1], parts[0]) + + plural_pairs = [] + irregular_plurals = { + "foot": "feet", "inch": "inches", + "ounce": "ounces", + } + for name, symbol in singular_names.items(): + if name in irregular_plurals: + plural_pairs.append((irregular_plurals[name], symbol)) + elif name.endswith(("s", "z", "sh", "ch", "x")): + plural_pairs.append((name + "es", symbol)) + elif name.endswith("y") and len(name) > 1 and name[-2] not in "aeiou": + plural_pairs.append((name[:-1] + "ies", symbol)) + else: + plural_pairs.append((name + "s", symbol)) - # Handle plurals: strip trailing "s" to match singular form - # e.g. "meters" -> "meter" -> "m", "kilograms" -> "kilogram" -> "kg" - depluralize = pynini.cdrewrite( - cross("s", ""), "", "[EOS]", self.VSIGMA - ) - # Handle irregular plurals: "feet" -> "foot" - irregular = pynini.string_map([("feet", "foot")]) unit_singular = units_graph - unit_plural = (depluralize | irregular) @ units_graph + unit_plural = pynini.string_map(plural_pairs) unit = unit_singular | unit_plural # Handle "per" units: "per hour" -> "/h" - per_unit = insert("/") + delete("per") + ds + unit_singular + per_unit = add_weight(insert("/") + delete("per") + ds + unit_singular, 1) full_unit = unit + closure(ds + per_unit, 0, 1) | per_unit # Cardinal value diff --git a/itn/english/rules/telephone.py b/itn/english/rules/telephone.py index 9576c262..51c05922 100644 --- a/itn/english/rules/telephone.py +++ b/itn/english/rules/telephone.py @@ -40,11 +40,17 @@ def build_tagger(self): ("five","5"),("six","6"),("seven","7"),("eight","8"), ("nine","9"),("zero","0"),("oh","0"),("o","0")]]) + # "triple X" => XXX + triple = union(*[cross(f"triple {w}", f"{d}{d}{d}") + for w, d in [("one","1"),("two","2"),("three","3"),("four","4"), + ("five","5"),("six","6"),("seven","7"),("eight","8"), + ("nine","9"),("zero","0"),("oh","0"),("o","0")]]) + # two-digit cardinal: twenty three => 23 (uses graph_two_digit for proper space handling) two_digit = self.cardinal.graph_two_digit - # a token is 1 or 2 digits - token = single | double | add_weight(two_digit, 0.002) + # a token is 1, 2, or 3 digits + token = single | double | triple | add_weight(two_digit, 0.002) # sequence of tokens separated by spaces seq = token + closure(ds + token) @@ -75,6 +81,7 @@ def build_tagger(self): ip_token = ( single + closure(ds + single, 0, 2) | double + | triple | add_weight(two_digit, 0.002) | single + ds + two_digit | two_digit + ds + single diff --git a/itn/english/rules/time.py b/itn/english/rules/time.py index f2e5d75a..a098573b 100644 --- a/itn/english/rules/time.py +++ b/itn/english/rules/time.py @@ -15,6 +15,8 @@ from pynini import closure, cross, invert, string_file, union from pynini.lib.pynutil import add_weight, delete, insert +TO_OR_TILL = union("to", "till") + from itn.english.rules.cardinal import Cardinal from tn.processor import Processor from tn.utils import get_abs_path @@ -56,7 +58,7 @@ def build_tagger(self): min_single_raw = union(*[cross(_num_to_word(x), str(x)) for x in range(1, 10)]) min_double_raw = graph_min_double # already no padding - oclock = cross("o'clock", "") | cross("oclock", "") | cross("hundred hours", "") + oclock = cross("o'clock", "") | cross("o' clock", "") | cross("o clock", "") | cross("oclock", "") | cross("hundred hours", "") hour = insert('hour: "') + hour_all + insert('"') hour12 = insert('hour: "') + hour_12 + insert('"') @@ -80,23 +82,25 @@ def build_tagger(self): graph_o_min_suffix = ( hour + ds + insert(' minute: "') + delete("o") + ds + graph_min_single + insert('"') + suffix + zone_opt ) - # "half past two", "quarter past two" + # "half past two", "quarter past two", "ten past four" graph_past = ( - insert('minute: "') + graph_min_verbose + insert('"') + ds + delete("past") + ds + hour + insert('minute: "') + + (graph_min_single | graph_min_double | graph_min_verbose) + + insert('"') + ds + delete("past") + ds + hour ) - # "quarter to one" => 12:45 + # "quarter to one" / "quarter till one" => 12:45 graph_quarter_to = ( insert('minute: "') + cross("quarter", "45") + insert('"') - + ds + delete("to") + ds + + ds + delete(TO_OR_TILL) + ds + insert('hour: "') + to_hour + insert('"') ) - # "ten to eleven pm" => 10:50 p.m. + # "ten to eleven pm" / "ten till eleven pm" => 10:50 p.m. graph_min_to = ( insert('minute: "') + ((min_single_raw | min_double_raw) @ minute_to) + insert('"') + closure(ds + delete("min") + delete("ute").ques + delete("s").ques, 0, 1) - + ds + delete("to") + ds + + ds + delete(TO_OR_TILL) + ds + insert('hour: "') + to_hour + insert('"') + suffix ) diff --git a/itn/english/test/data/en_date.txt b/itn/english/test/data/en_date.txt index de5be292..0260a974 100644 --- a/itn/english/test/data/en_date.txt +++ b/itn/english/test/data/en_date.txt @@ -34,3 +34,11 @@ nineteen seventy five => 1975 eleven fifty five => 1155 second quarter of twenty twenty two => Q2 2022 seven fifty b c => 750BC +seven fifty b c e => 750BCE +nineteen hundred c e => 1900CE +nineteen hundred a d => 1900AD +first half of twenty twenty two => H1 2022 +second half of twenty twenty => H2 2020 +nineteen hundreds => 1900s +twenty hundreds => 2000s +two thousands => 2000s diff --git a/itn/english/test/data/en_measure.txt b/itn/english/test/data/en_measure.txt index 612f31df..5a2cae25 100644 --- a/itn/english/test/data/en_measure.txt +++ b/itn/english/test/data/en_measure.txt @@ -110,3 +110,5 @@ eight point five megawatts => 8.5 mW eight point five meters => 8.5 m eight point five two percent => 8.52 % eight point four four percent => 8.44 % +five ounces => 5 oz +ten kilo calories => 10 kcal diff --git a/itn/english/test/data/en_telephone.txt b/itn/english/test/data/en_telephone.txt index d0130b4c..9111d38d 100644 --- a/itn/english/test/data/en_telephone.txt +++ b/itn/english/test/data/en_telephone.txt @@ -21,3 +21,4 @@ r t x forty fifty t i => RTX 4050ti four three two double seven three two one four three two one four three double zero five => 432 7732 143214 3005 a thirty six => a 36 a ten eighty p display => a 1080p display +triple five one two three one two three four => 555-123-1234 diff --git a/itn/english/test/data/en_time.txt b/itn/english/test/data/en_time.txt index 3a049820..3d9abf3b 100644 --- a/itn/english/test/data/en_time.txt +++ b/itn/english/test/data/en_time.txt @@ -27,3 +27,10 @@ quarter to one => 12:45 quarter to twelve => 11:45 set alarm at ten to eleven pm => set alarm at 10:50 p.m. one min to one am => 12:59 a.m. +ten past four => 04:10 +twenty five past three => 03:25 +five past twelve => 12:05 +quarter till two => 01:45 +ten till four pm => 03:50 p.m. +three o' clock => 03:00 +three o clock => 03:00