From 0d44e3510c55d804dc84e5d2771d71c77398f591 Mon Sep 17 00:00:00 2001 From: pengzhendong <275331498@qq.com> Date: Thu, 11 Jun 2026 13:45:22 +0800 Subject: [PATCH] fix: English TN no longer inserts spurious space after opening punctuation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The verbalizer's INSERT_SPACE was applied uniformly between all non-punct tokens, causing unwanted spaces after opening quotes and parens (e.g., `"hello"` → `" hello"`). Split the verbalizer pattern so classify tokens use INSERT_SPACE for inter-word spacing while punct tokens use DELETE_SPACE — punct values already carry surrounding spacing via the tagger's add_weight(accep(" "), -1.0).star. Add test cases for the reported issue and related patterns. --- tn/english/normalizer.py | 16 ++++++++++++---- tn/english/test/data/normalizer.txt | 3 +++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/tn/english/normalizer.py b/tn/english/normalizer.py index 1ed7f64..2d0d313 100644 --- a/tn/english/normalizer.py +++ b/tn/english/normalizer.py @@ -78,7 +78,7 @@ def build_tagger_and_verbalizer(self): ).optimize() + (add_weight(punctuation.tagger, 2.00).plus | self.DELETE_SPACE) self.tagger = (delete(" ").star + tagger.star) @ self.build_rule(delete(" "), r="[EOS]") - verbalizer = ( + classify = ( cardinal.verbalizer | ordinal.verbalizer | word.verbalizer @@ -92,7 +92,15 @@ def build_tagger_and_verbalizer(self): | electronic.verbalizer | serial.verbalizer | whitelist.verbalizer - | punctuation.verbalizer | rang.verbalizer - ).optimize() + (punctuation.verbalizer.plus | self.INSERT_SPACE) - self.verbalizer = verbalizer.star @ self.build_rule(delete(" "), r="[EOS]") + ).optimize() + punct = punctuation.verbalizer.optimize() + # Punct tokens carry surrounding spacing in their values (the tagger's + # add_weight(accep(" "), -1.0).star absorbs spaces around punctuation). + # So punct tokens handle their own spacing and don't need INSERT_SPACE. 
+ # Only classify tokens need INSERT_SPACE for inter-word spacing. + verbalizer = ( + classify + (punct.plus | self.INSERT_SPACE) + | punct + (punct.plus | self.DELETE_SPACE) + ).star + self.verbalizer = verbalizer @ self.build_rule(delete(" "), r="[EOS]") diff --git a/tn/english/test/data/normalizer.txt b/tn/english/test/data/normalizer.txt index 7d04728..2deb920 100644 --- a/tn/english/test/data/normalizer.txt +++ b/tn/english/test/data/normalizer.txt @@ -6,3 +6,6 @@ Try searching for 'Toyota' or 'Investment' => Try searching for 'Toyota' or 'Inv "" => "" The HTML tag <p> defines a paragraph. => The HTML tag <p> defines a paragraph. hello world => hello world +"So, one hundred thousand merit shouldn't be a problem." => "So, one hundred thousand merit shouldn't be a problem." +"hello" => "hello" +(hello) => (hello)