Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions itn/english/data/measurements.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,7 @@ gy gray
sv sievert
cwt hundredweight
cc c c
mph miles per hour
sq ft square feet
kgf/cm² kilograms force per square centimeter
kgf/cm² kilogram force per square centimeter
54 changes: 47 additions & 7 deletions itn/english/inverse_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,22 @@
# limitations under the License.

from importlib_resources import files
from pynini import closure
from pynini.lib.pynutil import add_weight, delete

from itn.english.rules.cardinal import Cardinal
from itn.english.rules.char import Char
from itn.english.rules.date import Date
from itn.english.rules.decimal import Decimal
from itn.english.rules.electronic import Electronic
from itn.english.rules.measure import Measure
from itn.english.rules.money import Money
from itn.english.rules.ordinal import Ordinal
from itn.english.rules.punctuation import Punctuation
from itn.english.rules.telephone import Telephone
from itn.english.rules.time import Time
from itn.english.rules.whitelist import Whitelist
from itn.english.rules.word import Word
from tn.processor import Processor


Expand All @@ -34,23 +44,53 @@ def build_tagger_and_verbalizer(self):
cardinal = Cardinal()
ordinal = Ordinal(cardinal=cardinal)
decimal = Decimal(cardinal=cardinal)
date = Date(cardinal=cardinal, ordinal=ordinal)
time = Time(cardinal=cardinal)
measure = Measure(cardinal=cardinal, decimal=decimal)
money = Money(cardinal=cardinal, decimal=decimal)
telephone = Telephone(cardinal=cardinal)
electronic = Electronic()
whitelist = Whitelist()
word = Word()
char = Char()
punctuation = Punctuation()

tagger = (
add_weight(ordinal.tagger, 1.0)
| add_weight(decimal.tagger, 1.01)
| add_weight(cardinal.tagger, 1.02)
classify = (
add_weight(date.tagger, 1.09)
| add_weight(time.tagger, 1.1)
| add_weight(measure.tagger, 1.1)
| add_weight(money.tagger, 1.08)
| add_weight(whitelist.tagger, 1.01)
| add_weight(telephone.tagger, 1.1)
| add_weight(electronic.tagger, 1.1)
| add_weight(ordinal.tagger, 1.09)
| add_weight(decimal.tagger, 1.1)
| add_weight(cardinal.tagger, 1.1)
| add_weight(word.tagger, 50)
| add_weight(char.tagger, 100)
).optimize()

tagger = tagger.star
self.tagger = tagger @ self.build_rule(delete(" "), "", "[EOS]")
punct = add_weight(punctuation.tagger, 1.1)
token = closure(punct + delete(" ").ques) + classify + closure(delete(" ").ques + punct)
graph = token + closure(self.DELETE_EXTRA_SPACE + token)
self.tagger = delete(" ").star + graph + delete(" ").star

verbalizer = (
cardinal.verbalizer
| ordinal.verbalizer
| decimal.verbalizer
| date.verbalizer
| time.verbalizer
| measure.verbalizer
| money.verbalizer
| telephone.verbalizer
| electronic.verbalizer
| whitelist.verbalizer
| word.verbalizer
| char.verbalizer
| punctuation.verbalizer
).optimize()

self.verbalizer = verbalizer.star
self.verbalizer = (verbalizer + self.INSERT_SPACE).star @ self.build_rule(
self.DELETE_EXTRA_SPACE
) @ self.build_rule(delete(" "), r="[EOS]")
17 changes: 13 additions & 4 deletions itn/english/rules/cardinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from pynini import closure, cross, string_file, union
from pynini.lib.pynutil import delete, insert
from pynini import closure, cross, difference, string_file, union
from pynini.lib.pynutil import add_weight, delete, insert

from tn.processor import Processor
from tn.utils import get_abs_path
Expand All @@ -35,7 +35,8 @@ def build_tagger(self):

# 1~9, 10~19, 20~99
one_digit = digit
two_digit = teen | (ties + (ds + digit | insert("0")))
two_digit = teen | (ties + (ds + digit | add_weight(insert("0"), 0.1)))
self.graph_two_digit = two_digit
up_to_99 = one_digit | two_digit

# one hundred, one hundred twenty three, one hundred one
Expand All @@ -47,6 +48,7 @@ def build_tagger(self):

# 1~999
up_to_999 = up_to_99 | hundreds
self.up_to_999 = up_to_999
# 1~999 with zero-padding to 3 digits
up_to_999_padded = hundreds | insert("0") + two_digit | insert("00") + one_digit

Expand Down Expand Up @@ -111,10 +113,17 @@ def _with_mag_padded(name):
graph = (delete_and @ graph).optimize()

self.graph = graph
self.graph_no_exception = graph

# exclude 0-12 from cardinal tagger (they stay as words)
from itn.english.rules.time import _num_to_word
exception_labels = ["zero"] + [_num_to_word(x) for x in range(1, 13)]
exception = union(*exception_labels).optimize()
graph_with_exception = (difference(self.VSIGMA, exception) @ graph).optimize()

minus = delete("minus") | delete("negative")
optional_minus = closure(insert('negative: "-" ') + minus + ds, 0, 1)
final_graph = optional_minus + insert('integer: "') + graph + insert('"')
final_graph = optional_minus + insert('integer: "') + graph_with_exception + insert('"')
self.tagger = self.add_tokens(final_graph)

def build_verbalizer(self):
Expand Down
33 changes: 30 additions & 3 deletions itn/english/rules/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ def build_tagger(self):

# Year as two groups of two digits: "twenty twelve" => 2012
year_two_parts = (teen | two_digit) + ds + (two_digit | oh_digit | teen)
# 3-digit year: "seven fifty" => 750
year_three_digit = digit + ds + (two_digit | oh_digit | teen)

# Year as "X thousand Y": "two thousand twelve" => 2012
# Need zero-padded variants so "two thousand three" => 2003
Expand Down Expand Up @@ -116,9 +118,32 @@ def build_tagger(self):
+ po
)
# Year only => "twenty twelve", "two thousand three"
graph_y = add_weight(year, 0.01) + po
graph_y = year + po

final_graph = graph_mdy | graph_md | graph_my | graph_dmy | graph_dm | graph_y
# Decades: "nineteen eighties" => 1980s
decade_suffix = closure(self.ALPHA, 1) + (cross("ies", "y") | delete("s"))
decade_word = pynini.compose(decade_suffix, ties | cross("ten", "10"))
graph_decade = (
insert('year: "') + (teen | two_digit) + ds + decade_word + insert('0s"') + po
)

# Quarter: "second quarter of twenty twenty two" => Q2 2022
quarter_num = (
cross("first", "1") | cross("second", "2")
| cross("third", "3") | cross("fourth", "4")
)
graph_quarter = (
insert('day: "Q') + quarter_num + insert('"')
+ ds + delete("quarter") + ds + delete("of") + ds
+ insert(' year: "') + year_graph + insert('"') + po
)

# BC/AD suffix
bc_ad = ds + (cross("b c", "BC") | cross("a d", "AD"))
year_graph_with_3digit = year_graph | year_three_digit
graph_y_bc = insert('year: "') + year_graph_with_3digit + bc_ad + insert('"') + po

final_graph = graph_mdy | graph_md | graph_my | graph_dmy | graph_dm | graph_y | graph_decade | graph_quarter | graph_y_bc
self.tagger = self.add_tokens(final_graph)

def build_verbalizer(self):
Expand Down Expand Up @@ -160,6 +185,8 @@ def build_verbalizer(self):
graph_dmy = day + self.DELETE_SPACE + insert(" ") + month + optional_year
# year only
graph_y = year
# day + year (for quarter: Q2 2022)
graph_dy = day + self.DELETE_SPACE + insert(" ") + year

graph = (graph_mdy | graph_dmy | graph_y) + self.DELETE_SPACE + delete_po
graph = (graph_mdy | graph_dmy | graph_dy | graph_y) + self.DELETE_SPACE + delete_po
self.verbalizer = self.delete_tokens(graph)
30 changes: 27 additions & 3 deletions itn/english/rules/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from pynini import closure, cross, string_file
from pynini import closure, cross, string_file, union
from pynini.lib.pynutil import delete, insert

from itn.english.rules.cardinal import Cardinal
from tn.processor import Processor
from tn.utils import get_abs_path
from tn.utils import get_abs_path, load_labels


class Decimal(Processor):
Expand Down Expand Up @@ -45,6 +45,25 @@ def build_tagger(self):
point = delete("point")

graph = optional_negative + closure(integer_part + ds, 0, 1) + point + ds + frac_part

# quantity: "five point two million" => 5.2 million
quantities = load_labels(get_abs_path("../itn/english/data/numbers/thousands.tsv"))
quantity_all = union(*[x[0] for x in quantities])
quantity_no_thousand = union(*[x[0] for x in quantities if x[0] != "thousand"])
# decimal + quantity: five point two million, 164.58 thousand
quantity_graph = (
optional_negative + integer_part + ds + point + ds + frac_part
+ ds + insert(' quantity: "') + quantity_all + insert('"')
)
# cardinal (up to 999) + quantity: four hundred million, five million
# exclude thousand to let cardinal handle "ten thousand" => 10000
cardinal_small = self.cardinal.up_to_999
cardinal_quantity = (
optional_negative + insert('integer_part: "') + cardinal_small + insert('"')
+ ds + insert(' quantity: "') + quantity_no_thousand + insert('"')
)
graph |= quantity_graph | cardinal_quantity

self.tagger = self.add_tokens(graph)

def build_verbalizer(self):
Expand All @@ -56,6 +75,11 @@ def build_verbalizer(self):
+ delete('"') + self.NOT_QUOTE.plus + delete('"')
)
optional_fractional = closure(fractional + self.DELETE_SPACE, 0, 1)
graph = optional_sign + optional_integer + optional_fractional
quantity = (
insert(" ") + delete('quantity:') + self.DELETE_SPACE
+ delete('"') + self.NOT_QUOTE.plus + delete('"')
)
optional_quantity = closure(quantity + self.DELETE_SPACE, 0, 1)
graph = optional_sign + optional_integer + optional_fractional + optional_quantity
self.numbers = graph
self.verbalizer = self.delete_tokens(graph)
97 changes: 28 additions & 69 deletions itn/english/rules/electronic.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from pynini import closure, cross, invert, string_file
from pynini import accep, closure, cross, difference, invert, string_file
from pynini.lib.pynutil import add_weight, delete, insert

from tn.processor import Processor
Expand All @@ -28,90 +28,49 @@ def __init__(self):

def build_tagger(self):
ds = delete(" ")

# Single characters: digits and letters
digit = string_file(get_abs_path("../itn/english/data/numbers/digit.tsv"))
zero = string_file(get_abs_path("../itn/english/data/numbers/zero.tsv"))
alpha_or_digit = self.ALPHA | digit | zero

# Symbols from TSV (symbol\tname): invert to get name -> symbol
symbols = invert(
string_file(get_abs_path("../itn/english/data/electronic/symbols.tsv"))
)
symbols = invert(string_file(get_abs_path("../itn/english/data/electronic/symbols.tsv")))

# A "token" is either a single char (letter/digit/symbol) or a
# multi-letter word kept verbatim (e.g. "gmail", "nvidia").
# Multi-letter words have lower priority so spelled-out letters are preferred.
word = add_weight(closure(self.ALPHA, 2), 0.01)
token = alpha_or_digit | symbols | word
char = self.ALPHA | digit | zero
word = add_weight(closure(self.ALPHA, 2), 0.1)
token = char | symbols | word
first_token = char | difference(word, accep("dot"))
component = first_token + closure(ds + token)

# A component is one or more tokens separated by spaces
component = token + closure(ds + token)
dot = cross("dot", ".")
domain = component + (ds + dot + ds + component).plus

username = insert('username: "') + component + insert('"')
domain_field = insert('domain: "') + domain + insert('"')

# Domain: component(s) separated by "dot" => "."
dot = cross("dot", ".")
domain_content = component + closure(ds + dot + ds + component)
domain = insert('domain: "') + domain_content + insert('"')

# Email: username at domain
graph_email = (
username
+ ds
+ delete("at")
+ ds
+ insert(" ")
+ domain
)

# URL protocol: "h t t p colon slash slash" or "h t t p s colon slash slash"
# Email: X at Y dot Z (requires "at" keyword)
graph_email = username + ds + delete("at") + ds + insert(" ") + domain_field

# URL: requires protocol or www prefix
http = cross("h t t p", "http")
https = cross("h t t p s", "https")
colon_slash_slash = cross(" colon slash slash ", "://")
protocol_start = (http | https) + colon_slash_slash

# www prefix
protocol = (http | https) + cross(" colon slash slash ", "://")
www = cross("w w w", "www")

# URL: [protocol] [www.] domain
url_content = (
closure(protocol_start, 0, 1)
+ closure(www + ds + dot + ds, 0, 1)
+ domain_content
)
graph_url = insert('protocol: "') + url_content + insert('"')
# protocol + [www.] + domain
url_with_protocol = protocol + closure(www + ds + dot + ds, 0, 1) + domain
# www. + domain (no protocol)
url_with_www = www + ds + dot + ds + domain
# domain only (must have dot): nvidia dot com
url_domain_only = domain

graph_url = insert('protocol: "') + (url_with_protocol | url_with_www | url_domain_only) + insert('"')

final_graph = graph_email | graph_url
self.tagger = self.add_tokens(final_graph)

def build_verbalizer(self):
username = (
delete("username:")
+ self.DELETE_SPACE
+ delete('"')
+ self.NOT_QUOTE.plus
+ delete('"')
)
domain = (
delete("domain:")
+ self.DELETE_SPACE
+ delete('"')
+ self.NOT_QUOTE.plus
+ delete('"')
)
protocol = (
delete("protocol:")
+ self.DELETE_SPACE
+ delete('"')
+ self.NOT_QUOTE.plus
+ delete('"')
)

# Email: username@domain
username = delete("username:") + self.DELETE_SPACE + delete('"') + self.NOT_QUOTE.plus + delete('"')
domain = delete("domain:") + self.DELETE_SPACE + delete('"') + self.NOT_QUOTE.plus + delete('"')
protocol = delete("protocol:") + self.DELETE_SPACE + delete('"') + self.NOT_QUOTE.plus + delete('"')

graph_email = username + self.DELETE_SPACE + insert("@") + domain
# URL: just output the protocol content directly
graph_url = protocol

graph = graph_email | graph_url
self.verbalizer = self.delete_tokens(graph)
self.verbalizer = self.delete_tokens(graph_email | graph_url)
Loading
Loading