wenet-e2e · pengzhendong · Jun 10, 2026 · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026
diff --git a/itn/english/data/measurements.tsv b/itn/english/data/measurements.tsv
@@ -143,3 +143,7 @@ gy	gray
 sv	sievert
 cwt	hundredweight
 cc	c c
+mph	miles per hour
+sq ft	square feet
+kgf/cm²	kilograms force per square centimeter
+kgf/cm²	kilogram force per square centimeter
diff --git a/itn/english/inverse_normalizer.py b/itn/english/inverse_normalizer.py
@@ -13,12 +13,22 @@
 # limitations under the License.
 
 from importlib_resources import files
+from pynini import closure
 from pynini.lib.pynutil import add_weight, delete
 
 from itn.english.rules.cardinal import Cardinal
 from itn.english.rules.char import Char
+from itn.english.rules.date import Date
 from itn.english.rules.decimal import Decimal
+from itn.english.rules.electronic import Electronic
+from itn.english.rules.measure import Measure
+from itn.english.rules.money import Money
 from itn.english.rules.ordinal import Ordinal
+from itn.english.rules.punctuation import Punctuation
+from itn.english.rules.telephone import Telephone
+from itn.english.rules.time import Time
+from itn.english.rules.whitelist import Whitelist
+from itn.english.rules.word import Word
 from tn.processor import Processor
 
 
@@ -34,23 +44,53 @@ def build_tagger_and_verbalizer(self):
         cardinal = Cardinal()
         ordinal = Ordinal(cardinal=cardinal)
         decimal = Decimal(cardinal=cardinal)
+        date = Date(cardinal=cardinal, ordinal=ordinal)
+        time = Time(cardinal=cardinal)
+        measure = Measure(cardinal=cardinal, decimal=decimal)
+        money = Money(cardinal=cardinal, decimal=decimal)
+        telephone = Telephone(cardinal=cardinal)
+        electronic = Electronic()
+        whitelist = Whitelist()
+        word = Word()
         char = Char()
+        punctuation = Punctuation()
 
-        tagger = (
-            add_weight(ordinal.tagger, 1.0)
-            | add_weight(decimal.tagger, 1.01)
-            | add_weight(cardinal.tagger, 1.02)
+        classify = (
+            add_weight(date.tagger, 1.09)
+            | add_weight(time.tagger, 1.1)
+            | add_weight(measure.tagger, 1.1)
+            | add_weight(money.tagger, 1.08)
+            | add_weight(whitelist.tagger, 1.01)
+            | add_weight(telephone.tagger, 1.1)
+            | add_weight(electronic.tagger, 1.1)
+            | add_weight(ordinal.tagger, 1.09)
+            | add_weight(decimal.tagger, 1.1)
+            | add_weight(cardinal.tagger, 1.1)
+            | add_weight(word.tagger, 50)
             | add_weight(char.tagger, 100)
         ).optimize()
 
-        tagger = tagger.star
-        self.tagger = tagger @ self.build_rule(delete(" "), "", "[EOS]")
+        punct = add_weight(punctuation.tagger, 1.1)
+        token = closure(punct + delete(" ").ques) + classify + closure(delete(" ").ques + punct)
+        graph = token + closure(self.DELETE_EXTRA_SPACE + token)
+        self.tagger = delete(" ").star + graph + delete(" ").star
 
         verbalizer = (
             cardinal.verbalizer
             | ordinal.verbalizer
             | decimal.verbalizer
+            | date.verbalizer
+            | time.verbalizer
+            | measure.verbalizer
+            | money.verbalizer
+            | telephone.verbalizer
+            | electronic.verbalizer
+            | whitelist.verbalizer
+            | word.verbalizer
             | char.verbalizer
+            | punctuation.verbalizer
         ).optimize()
 
-        self.verbalizer = verbalizer.star
+        self.verbalizer = (verbalizer + self.INSERT_SPACE).star @ self.build_rule(
+            self.DELETE_EXTRA_SPACE
+        ) @ self.build_rule(delete(" "), r="[EOS]")
diff --git a/itn/english/rules/cardinal.py b/itn/english/rules/cardinal.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from pynini import closure, cross, string_file, union
-from pynini.lib.pynutil import delete, insert
+from pynini import closure, cross, difference, string_file, union
+from pynini.lib.pynutil import add_weight, delete, insert
 
 from tn.processor import Processor
 from tn.utils import get_abs_path
@@ -35,7 +35,8 @@ def build_tagger(self):
 
         # 1~9, 10~19, 20~99
         one_digit = digit
-        two_digit = teen | (ties + (ds + digit | insert("0")))
+        two_digit = teen | (ties + (ds + digit | add_weight(insert("0"), 0.1)))
+        self.graph_two_digit = two_digit
         up_to_99 = one_digit | two_digit
 
         # one hundred, one hundred twenty three, one hundred one
@@ -47,6 +48,7 @@ def build_tagger(self):
 
         # 1~999
         up_to_999 = up_to_99 | hundreds
+        self.up_to_999 = up_to_999
         # 1~999 with zero-padding to 3 digits
         up_to_999_padded = hundreds | insert("0") + two_digit | insert("00") + one_digit
 
@@ -111,10 +113,17 @@ def _with_mag_padded(name):
         graph = (delete_and @ graph).optimize()
 
         self.graph = graph
+        self.graph_no_exception = graph
+
+        # exclude 0-12 from cardinal tagger (they stay as words)
+        from itn.english.rules.time import _num_to_word
+        exception_labels = ["zero"] + [_num_to_word(x) for x in range(1, 13)]
+        exception = union(*exception_labels).optimize()
+        graph_with_exception = (difference(self.VSIGMA, exception) @ graph).optimize()
 
         minus = delete("minus") | delete("negative")
         optional_minus = closure(insert('negative: "-" ') + minus + ds, 0, 1)
-        final_graph = optional_minus + insert('integer: "') + graph + insert('"')
+        final_graph = optional_minus + insert('integer: "') + graph_with_exception + insert('"')
         self.tagger = self.add_tokens(final_graph)
 
     def build_verbalizer(self):

diff --git a/itn/english/rules/date.py b/itn/english/rules/date.py
@@ -56,6 +56,8 @@ def build_tagger(self):
 
         # Year as two groups of two digits: "twenty twelve" => 2012
         year_two_parts = (teen | two_digit) + ds + (two_digit | oh_digit | teen)
+        # 3-digit year: "seven fifty" => 750
+        year_three_digit = digit + ds + (two_digit | oh_digit | teen)
 
         # Year as "X thousand Y": "two thousand twelve" => 2012
         # Need zero-padded variants so "two thousand three" => 2003
@@ -116,9 +118,32 @@ def build_tagger(self):
             + po
         )
         # Year only => "twenty twelve", "two thousand three"
-        graph_y = add_weight(year, 0.01) + po
+        graph_y = year + po
 
-        final_graph = graph_mdy | graph_md | graph_my | graph_dmy | graph_dm | graph_y
+        # Decades: "nineteen eighties" => 1980s, "nineteen tens" => 1910s (decade_word yields only the tens digit; "0s" is appended below)
+        decade_suffix = closure(self.ALPHA, 1) + (cross("ies", "y") | delete("s"))
+        decade_word = pynini.compose(decade_suffix, ties | cross("ten", "1"))
+        graph_decade = (
+            insert('year: "') + (teen | two_digit) + ds + decade_word + insert('0s"') + po
+        )
+
+        # Quarter: "second quarter of twenty twenty two" => Q2 2022
+        quarter_num = (
+            cross("first", "1") | cross("second", "2")
+            | cross("third", "3") | cross("fourth", "4")
+        )
+        graph_quarter = (
+            insert('day: "Q') + quarter_num + insert('"')
+            + ds + delete("quarter") + ds + delete("of") + ds
+            + insert(' year: "') + year_graph + insert('"') + po
+        )
+
+        # BC/AD suffix
+        bc_ad = ds + (cross("b c", "BC") | cross("a d", "AD"))
+        year_graph_with_3digit = year_graph | year_three_digit
+        graph_y_bc = insert('year: "') + year_graph_with_3digit + bc_ad + insert('"') + po
+
+        final_graph = graph_mdy | graph_md | graph_my | graph_dmy | graph_dm | graph_y | graph_decade | graph_quarter | graph_y_bc
         self.tagger = self.add_tokens(final_graph)
 
     def build_verbalizer(self):
@@ -160,6 +185,8 @@ def build_verbalizer(self):
         graph_dmy = day + self.DELETE_SPACE + insert(" ") + month + optional_year
         # year only
         graph_y = year
+        # day + year (for quarter: Q2 2022)
+        graph_dy = day + self.DELETE_SPACE + insert(" ") + year
 
-        graph = (graph_mdy | graph_dmy | graph_y) + self.DELETE_SPACE + delete_po
+        graph = (graph_mdy | graph_dmy | graph_dy | graph_y) + self.DELETE_SPACE + delete_po
         self.verbalizer = self.delete_tokens(graph)
diff --git a/itn/english/rules/decimal.py b/itn/english/rules/decimal.py
@@ -12,12 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from pynini import closure, cross, string_file
+from pynini import closure, cross, string_file, union
 from pynini.lib.pynutil import delete, insert
 
 from itn.english.rules.cardinal import Cardinal
 from tn.processor import Processor
-from tn.utils import get_abs_path
+from tn.utils import get_abs_path, load_labels
 
 
 class Decimal(Processor):
@@ -45,6 +45,25 @@ def build_tagger(self):
         point = delete("point")
 
         graph = optional_negative + closure(integer_part + ds, 0, 1) + point + ds + frac_part
+
+        # quantity: "five point two million" => 5.2 million
+        quantities = load_labels(get_abs_path("../itn/english/data/numbers/thousands.tsv"))
+        quantity_all = union(*[x[0] for x in quantities])
+        quantity_no_thousand = union(*[x[0] for x in quantities if x[0] != "thousand"])
+        # decimal + quantity: five point two million, 164.58 thousand
+        quantity_graph = (
+            optional_negative + integer_part + ds + point + ds + frac_part
+            + ds + insert(' quantity: "') + quantity_all + insert('"')
+        )
+        # cardinal (up to 999) + quantity: four hundred million, five million
+        # exclude thousand to let cardinal handle "ten thousand" => 10000
+        cardinal_small = self.cardinal.up_to_999
+        cardinal_quantity = (
+            optional_negative + insert('integer_part: "') + cardinal_small + insert('"')
+            + ds + insert(' quantity: "') + quantity_no_thousand + insert('"')
+        )
+        graph |= quantity_graph | cardinal_quantity
+
         self.tagger = self.add_tokens(graph)
 
     def build_verbalizer(self):
@@ -56,6 +75,11 @@ def build_verbalizer(self):
             + delete('"') + self.NOT_QUOTE.plus + delete('"')
         )
         optional_fractional = closure(fractional + self.DELETE_SPACE, 0, 1)
-        graph = optional_sign + optional_integer + optional_fractional
+        quantity = (
+            insert(" ") + delete('quantity:') + self.DELETE_SPACE
+            + delete('"') + self.NOT_QUOTE.plus + delete('"')
+        )
+        optional_quantity = closure(quantity + self.DELETE_SPACE, 0, 1)
+        graph = optional_sign + optional_integer + optional_fractional + optional_quantity
         self.numbers = graph
         self.verbalizer = self.delete_tokens(graph)
diff --git a/itn/english/rules/electronic.py b/itn/english/rules/electronic.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from pynini import closure, cross, invert, string_file
+from pynini import accep, closure, cross, difference, invert, string_file
 from pynini.lib.pynutil import add_weight, delete, insert
 
 from tn.processor import Processor
@@ -28,90 +28,49 @@ def __init__(self):
 
     def build_tagger(self):
         ds = delete(" ")
-
-        # Single characters: digits and letters
         digit = string_file(get_abs_path("../itn/english/data/numbers/digit.tsv"))
         zero = string_file(get_abs_path("../itn/english/data/numbers/zero.tsv"))
-        alpha_or_digit = self.ALPHA | digit | zero
-
-        # Symbols from TSV (symbol\tname): invert to get name -> symbol
-        symbols = invert(
-            string_file(get_abs_path("../itn/english/data/electronic/symbols.tsv"))
-        )
+        symbols = invert(string_file(get_abs_path("../itn/english/data/electronic/symbols.tsv")))
 
-        # A "token" is either a single char (letter/digit/symbol) or a
-        # multi-letter word kept verbatim (e.g. "gmail", "nvidia").
-        # Multi-letter words have lower priority so spelled-out letters are preferred.
-        word = add_weight(closure(self.ALPHA, 2), 0.01)
-        token = alpha_or_digit | symbols | word
+        char = self.ALPHA | digit | zero
+        word = add_weight(closure(self.ALPHA, 2), 0.1)
+        token = char | symbols | word
+        first_token = char | difference(word, accep("dot"))
+        component = first_token + closure(ds + token)
 
-        # A component is one or more tokens separated by spaces
-        component = token + closure(ds + token)
+        dot = cross("dot", ".")
+        domain = component + (ds + dot + ds + component).plus
 
         username = insert('username: "') + component + insert('"')
+        domain_field = insert('domain: "') + domain + insert('"')
 
-        # Domain: component(s) separated by "dot" => "."
-        dot = cross("dot", ".")
-        domain_content = component + closure(ds + dot + ds + component)
-        domain = insert('domain: "') + domain_content + insert('"')
-
-        # Email: username at domain
-        graph_email = (
-            username
-            + ds
-            + delete("at")
-            + ds
-            + insert(" ")
-            + domain
-        )
-
-        # URL protocol: "h t t p colon slash slash" or "h t t p s colon slash slash"
+        # Email: X at Y dot Z (requires "at" keyword)
+        graph_email = username + ds + delete("at") + ds + insert(" ") + domain_field
+
+        # URL: optional protocol and/or www prefix, followed by a domain containing at least one "dot"
         http = cross("h t t p", "http")
         https = cross("h t t p s", "https")
-        colon_slash_slash = cross(" colon slash slash ", "://")
-        protocol_start = (http | https) + colon_slash_slash
-
-        # www prefix
+        protocol = (http | https) + cross(" colon slash slash ", "://")
         www = cross("w w w", "www")
 
-        # URL: [protocol] [www.] domain
-        url_content = (
-            closure(protocol_start, 0, 1)
-            + closure(www + ds + dot + ds, 0, 1)
-            + domain_content
-        )
-        graph_url = insert('protocol: "') + url_content + insert('"')
+        # protocol + [www.] + domain
+        url_with_protocol = protocol + closure(www + ds + dot + ds, 0, 1) + domain
+        # www. + domain (no protocol)
+        url_with_www = www + ds + dot + ds + domain
+        # domain only (must have dot): nvidia dot com
+        url_domain_only = domain
+
+        graph_url = insert('protocol: "') + (url_with_protocol | url_with_www | url_domain_only) + insert('"')
 
         final_graph = graph_email | graph_url
         self.tagger = self.add_tokens(final_graph)
 
     def build_verbalizer(self):
-        username = (
-            delete("username:")
-            + self.DELETE_SPACE
-            + delete('"')
-            + self.NOT_QUOTE.plus
-            + delete('"')
-        )
-        domain = (
-            delete("domain:")
-            + self.DELETE_SPACE
-            + delete('"')
-            + self.NOT_QUOTE.plus
-            + delete('"')
-        )
-        protocol = (
-            delete("protocol:")
-            + self.DELETE_SPACE
-            + delete('"')
-            + self.NOT_QUOTE.plus
-            + delete('"')
-        )
-
-        # Email: username@domain
+        username = delete("username:") + self.DELETE_SPACE + delete('"') + self.NOT_QUOTE.plus + delete('"')
+        domain = delete("domain:") + self.DELETE_SPACE + delete('"') + self.NOT_QUOTE.plus + delete('"')
+        protocol = delete("protocol:") + self.DELETE_SPACE + delete('"') + self.NOT_QUOTE.plus + delete('"')
+
         graph_email = username + self.DELETE_SPACE + insert("@") + domain
-        # URL: just output the protocol content directly
         graph_url = protocol
 
-        graph = graph_email | graph_url
-        self.verbalizer = self.delete_tokens(graph)
+        self.verbalizer = self.delete_tokens(graph_email | graph_url)