From 60425ed252fc09b42f7d9e9ca1cdcdd14c90ebc5 Mon Sep 17 00:00:00 2001
From: pengzhendong <275331498@qq.com>
Date: Wed, 10 Jun 2026 11:45:45 +0800
Subject: [PATCH] feat: English ITN enhancements from NeMo reference

Time:
- Support numeric minutes with "past" (e.g. "ten past four" -> 04:10)
- Add "till" as alias for "to" (e.g. "quarter till two" -> 01:45)
- Add "o' clock" and "o clock" variants
- Zero-pad hours in to_hour.tsv

Telephone:
- Add "triple X" support (e.g. "triple five" -> 555)

Date:
- Add BCE, CE and long-form year suffixes
- Add H1/H2 financial half-year periods
- Add century ("nineteen hundreds" -> 1900s) and millennium ranges
- Add "X hundred" year form (e.g. "nineteen hundred" -> 1900)

Measure:
- Replace cdrewrite with finite string_map for pluralization (fixes OOM)
- Add -ies/-es rules and irregular plurals (feet, inches, ounces)
- Fix "per" unit priority to prefer direct TSV matches (e.g. mph)
---
 itn/english/data/time/to_hour.tsv      | 18 +++++------
 itn/english/rules/date.py              | 36 +++++++++++++++++++---
 itn/english/rules/measure.py           | 42 ++++++++++++++++++--------
 itn/english/rules/telephone.py         | 11 +++++--
 itn/english/rules/time.py              | 18 ++++++-----
 itn/english/test/data/en_date.txt      |  8 +++++
 itn/english/test/data/en_measure.txt   |  2 ++
 itn/english/test/data/en_telephone.txt |  1 +
 itn/english/test/data/en_time.txt      |  7 +++++
 9 files changed, 108 insertions(+), 35 deletions(-)

diff --git a/itn/english/data/time/to_hour.tsv b/itn/english/data/time/to_hour.tsv
index fba67f1b..cddd3d38 100644
--- a/itn/english/data/time/to_hour.tsv
+++ b/itn/english/data/time/to_hour.tsv
@@ -1,12 +1,12 @@
 one	12
-two	1
-three	2
-four	3
-five	4
-six	5
-seven	6
-eight	7
-nine	8
-ten	9
+two	01
+three	02
+four	03
+five	04
+six	05
+seven	06
+eight	07
+nine	08
+ten	09
 eleven	10
 twelve	11
\ No newline at end of file
diff --git a/itn/english/rules/date.py b/itn/english/rules/date.py
index 4804e6e3..6018473e 100644
--- a/itn/english/rules/date.py
+++ b/itn/english/rules/date.py
@@ -72,8 +72,10 @@ def build_tagger(self):
 
         # Year as hundreds: "nineteen oh five" => 1905
         year_hundreds = (teen | two_digit) + ds + oh_digit
+        # Year as "X hundred": "nineteen hundred" => 1900
+        year_xx_hundred = (teen | two_digit) + ds + delete("hundred") + insert("00")
 
-        year_graph = year_two_parts | year_thousands | year_hundreds
+        year_graph = year_two_parts | year_thousands | year_hundreds | year_xx_hundred
 
         # Delete optional "and" within year
         delete_and = self.build_rule(delete("and "), " ", self.ALPHA)
@@ -138,12 +140,38 @@ def build_tagger(self):
             + insert(' year: "') + year_graph + insert('"') + po
         )
 
-        # BC/AD suffix
-        bc_ad = ds + (cross("b c", "BC") | cross("a d", "AD"))
+        # BC/AD/BCE/CE suffix
+        bc_ad = ds + (
+            cross("b c e", "BCE") | cross("before common era", "BCE")
+            | cross("b c", "BC")
+            | cross("c e", "CE") | cross("common era", "CE")
+            | cross("a d", "AD")
+        )
         year_graph_with_3digit = year_graph | year_three_digit
         graph_y_bc = insert('year: "') + year_graph_with_3digit + bc_ad + insert('"') + po
 
-        final_graph = graph_mdy | graph_md | graph_my | graph_dmy | graph_dm | graph_y | graph_decade | graph_quarter | graph_y_bc
+        # Half: "first half of twenty twenty two" => H1 2022
+        half_num = cross("first", "1") | cross("second", "2")
+        graph_half = (
+            insert('day: "H') + half_num + insert('"')
+            + ds + delete("half") + ds + delete("of") + ds
+            + insert(' year: "') + year_graph + insert('"') + po
+        )
+
+        # Century: "nineteen hundreds" => 1900s
+        graph_century = (
+            insert('year: "') + (teen | two_digit) + ds + cross("hundreds", "00s") + insert('"') + po
+        )
+        # Millennium: "two thousands" => 2000s
+        graph_millennium = (
+            insert('year: "') + cross("two", "2") + ds + cross("thousands", "000s") + insert('"') + po
+        )
+
+        final_graph = (
+            graph_mdy | graph_md | graph_my | graph_dmy | graph_dm | graph_y
+            | graph_decade | graph_quarter | graph_half | graph_y_bc
+            | graph_century | graph_millennium
+        )
         self.tagger = self.add_tokens(final_graph)
 
     def build_verbalizer(self):
diff --git a/itn/english/rules/measure.py b/itn/english/rules/measure.py
index 909c7990..b910e4a6 100644
--- a/itn/english/rules/measure.py
+++ b/itn/english/rules/measure.py
@@ -14,7 +14,7 @@
 
 import pynini
 from pynini import closure, cross, invert, string_file
-from pynini.lib.pynutil import delete, insert
+from pynini.lib.pynutil import add_weight, delete, insert
 
 from itn.english.rules.cardinal import Cardinal
 from itn.english.rules.decimal import Decimal
@@ -35,24 +35,40 @@ def build_tagger(self):
         ds = delete(" ")
 
         # Load measurements: symbol\tname, invert to get name -> symbol
-        units_graph = invert(
-            string_file(get_abs_path("../itn/english/data/measurements.tsv"))
-        )
+        tsv_path = get_abs_path("../itn/english/data/measurements.tsv")
+        units_graph = invert(string_file(tsv_path))
+
+        # Handle plurals: generate plural->symbol mappings from the singular TSV entries
+        # Uses a finite string_map instead of cdrewrite to avoid a slow runtime composition
+        singular_names = {}
+        with open(tsv_path, encoding="utf-8") as f:
+            for line in f:
+                parts = line.strip().split("\t", 1)
+                if len(parts) == 2:
+                    singular_names.setdefault(parts[1], parts[0])
+
+        plural_pairs = []
+        irregular_plurals = {
+            "foot": "feet", "inch": "inches",
+            "ounce": "ounces",
+        }
+        for name, symbol in singular_names.items():
+            if name in irregular_plurals:
+                plural_pairs.append((irregular_plurals[name], symbol))
+            elif name.endswith(("s", "z", "sh", "ch", "x")):
+                plural_pairs.append((name + "es", symbol))
+            elif name.endswith("y") and len(name) > 1 and name[-2] not in "aeiou":
+                plural_pairs.append((name[:-1] + "ies", symbol))
+            else:
+                plural_pairs.append((name + "s", symbol))
 
-        # Handle plurals: strip trailing "s" to match singular form
-        # e.g. "meters" -> "meter" -> "m", "kilograms" -> "kilogram" -> "kg"
-        depluralize = pynini.cdrewrite(
-            cross("s", ""), "", "[EOS]", self.VSIGMA
-        )
-        # Handle irregular plurals: "feet" -> "foot"
-        irregular = pynini.string_map([("feet", "foot")])
         unit_singular = units_graph
-        unit_plural = (depluralize | irregular) @ units_graph
+        unit_plural = pynini.string_map(plural_pairs)
 
         unit = unit_singular | unit_plural
 
         # Handle "per" units: "per hour" -> "/h"
-        per_unit = insert("/") + delete("per") + ds + unit_singular
+        per_unit = add_weight(insert("/") + delete("per") + ds + unit_singular, 1)
         full_unit = unit + closure(ds + per_unit, 0, 1) | per_unit
 
         # Cardinal value
diff --git a/itn/english/rules/telephone.py b/itn/english/rules/telephone.py
index 9576c262..51c05922 100644
--- a/itn/english/rules/telephone.py
+++ b/itn/english/rules/telephone.py
@@ -40,11 +40,17 @@ def build_tagger(self):
                                       ("five","5"),("six","6"),("seven","7"),("eight","8"),
                                       ("nine","9"),("zero","0"),("oh","0"),("o","0")]])
 
+        # "triple X" => XXX
+        triple = union(*[cross(f"triple {w}", f"{d}{d}{d}")
+                         for w, d in [("one","1"),("two","2"),("three","3"),("four","4"),
+                                      ("five","5"),("six","6"),("seven","7"),("eight","8"),
+                                      ("nine","9"),("zero","0"),("oh","0"),("o","0")]])
+
         # two-digit cardinal: twenty three => 23 (uses graph_two_digit for proper space handling)
         two_digit = self.cardinal.graph_two_digit
 
-        # a token is 1 or 2 digits
-        token = single | double | add_weight(two_digit, 0.002)
+        # a token is 1, 2, or 3 digits
+        token = single | double | triple | add_weight(two_digit, 0.002)
 
         # sequence of tokens separated by spaces
         seq = token + closure(ds + token)
@@ -75,6 +81,7 @@ def build_tagger(self):
         ip_token = (
             single + closure(ds + single, 0, 2)
             | double
+            | triple
             | add_weight(two_digit, 0.002)
             | single + ds + two_digit
             | two_digit + ds + single
diff --git a/itn/english/rules/time.py b/itn/english/rules/time.py
index f2e5d75a..a098573b 100644
--- a/itn/english/rules/time.py
+++ b/itn/english/rules/time.py
@@ -15,6 +15,8 @@
 from pynini import closure, cross, invert, string_file, union
 from pynini.lib.pynutil import add_weight, delete, insert
 
+TO_OR_TILL = union("to", "till")
+
 from itn.english.rules.cardinal import Cardinal
 from tn.processor import Processor
 from tn.utils import get_abs_path
@@ -56,7 +58,7 @@ def build_tagger(self):
         min_single_raw = union(*[cross(_num_to_word(x), str(x)) for x in range(1, 10)])
         min_double_raw = graph_min_double  # already no padding
 
-        oclock = cross("o'clock", "") | cross("oclock", "") | cross("hundred hours", "")
+        oclock = cross("o'clock", "") | cross("o' clock", "") | cross("o clock", "") | cross("oclock", "") | cross("hundred hours", "")
 
         hour = insert('hour: "') + hour_all + insert('"')
         hour12 = insert('hour: "') + hour_12 + insert('"')
@@ -80,23 +82,25 @@ def build_tagger(self):
         graph_o_min_suffix = (
             hour + ds + insert(' minute: "') + delete("o") + ds + graph_min_single + insert('"') + suffix + zone_opt
         )
-        # "half past two", "quarter past two"
+        # "half past two", "quarter past two", "ten past four"
         graph_past = (
-            insert('minute: "') + graph_min_verbose + insert('"') + ds + delete("past") + ds + hour
+            insert('minute: "')
+            + (graph_min_single | graph_min_double | graph_min_verbose)
+            + insert('"') + ds + delete("past") + ds + hour
         )
-        # "quarter to one" => 12:45
+        # "quarter to one" / "quarter till one" => 12:45
         graph_quarter_to = (
             insert('minute: "') + cross("quarter", "45") + insert('"')
-            + ds + delete("to") + ds
+            + ds + delete(TO_OR_TILL) + ds
             + insert('hour: "') + to_hour + insert('"')
         )
-        # "ten to eleven pm" => 10:50 p.m.
+        # "ten to eleven pm" / "ten till eleven pm" => 10:50 p.m.
         graph_min_to = (
             insert('minute: "')
             + ((min_single_raw | min_double_raw) @ minute_to)
             + insert('"')
             + closure(ds + delete("min") + delete("ute").ques + delete("s").ques, 0, 1)
-            + ds + delete("to") + ds
+            + ds + delete(TO_OR_TILL) + ds
             + insert('hour: "') + to_hour + insert('"')
             + suffix
         )
diff --git a/itn/english/test/data/en_date.txt b/itn/english/test/data/en_date.txt
index de5be292..0260a974 100644
--- a/itn/english/test/data/en_date.txt
+++ b/itn/english/test/data/en_date.txt
@@ -34,3 +34,11 @@ nineteen seventy five => 1975
 eleven fifty five => 1155
 second quarter of twenty twenty two => Q2 2022
 seven fifty b c => 750BC
+seven fifty b c e => 750BCE
+nineteen hundred c e => 1900CE
+nineteen hundred a d => 1900AD
+first half of twenty twenty two => H1 2022
+second half of twenty twenty => H2 2020
+nineteen hundreds => 1900s
+twenty hundreds => 2000s
+two thousands => 2000s
diff --git a/itn/english/test/data/en_measure.txt b/itn/english/test/data/en_measure.txt
index 612f31df..5a2cae25 100644
--- a/itn/english/test/data/en_measure.txt
+++ b/itn/english/test/data/en_measure.txt
@@ -110,3 +110,5 @@ eight point five megawatts => 8.5 mW
 eight point five meters => 8.5 m
 eight point five two percent => 8.52 %
 eight point four four percent => 8.44 %
+five ounces => 5 oz
+ten kilo calories => 10 kcal
diff --git a/itn/english/test/data/en_telephone.txt b/itn/english/test/data/en_telephone.txt
index d0130b4c..9111d38d 100644
--- a/itn/english/test/data/en_telephone.txt
+++ b/itn/english/test/data/en_telephone.txt
@@ -21,3 +21,4 @@ r t x forty fifty t i => RTX 4050ti
 four three two double seven three two one four three two one four three double zero five => 432 7732 143214 3005
 a thirty six => a 36
 a ten eighty p display => a 1080p display
+triple five one two three one two three four => 555-123-1234
diff --git a/itn/english/test/data/en_time.txt b/itn/english/test/data/en_time.txt
index 3a049820..3d9abf3b 100644
--- a/itn/english/test/data/en_time.txt
+++ b/itn/english/test/data/en_time.txt
@@ -27,3 +27,10 @@ quarter to one => 12:45
 quarter to twelve => 11:45
 set alarm at ten to eleven pm => set alarm at 10:50 p.m.
 one min to one am => 12:59 a.m.
+ten past four => 04:10
+twenty five past three => 03:25
+five past twelve => 12:05
+quarter till two => 01:45
+ten till four pm => 03:50 p.m.
+three o' clock => 03:00
+three o clock => 03:00