Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions itn/english/data/time/to_hour.tsv
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
one 12
two 1
three 2
four 3
five 4
six 5
seven 6
eight 7
nine 8
ten 9
two 01
three 02
four 03
five 04
six 05
seven 06
eight 07
nine 08
ten 09
eleven 10
twelve 11
36 changes: 32 additions & 4 deletions itn/english/rules/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,10 @@ def build_tagger(self):

# Year as hundreds: "nineteen oh five" => 1905
year_hundreds = (teen | two_digit) + ds + oh_digit
# Year as "X hundred": "nineteen hundred" => 1900
year_xx_hundred = (teen | two_digit) + ds + delete("hundred") + insert("00")

year_graph = year_two_parts | year_thousands | year_hundreds
year_graph = year_two_parts | year_thousands | year_hundreds | year_xx_hundred

# Delete optional "and" within year
delete_and = self.build_rule(delete("and "), " ", self.ALPHA)
Expand Down Expand Up @@ -138,12 +140,38 @@ def build_tagger(self):
+ insert(' year: "') + year_graph + insert('"') + po
)

# BC/AD suffix
bc_ad = ds + (cross("b c", "BC") | cross("a d", "AD"))
# BC/AD/BCE/CE suffix
bc_ad = ds + (
cross("b c e", "BCE") | cross("before common era", "BCE")
| cross("b c", "BC")
| cross("c e", "CE") | cross("common era", "CE")
| cross("a d", "AD")
)
year_graph_with_3digit = year_graph | year_three_digit
graph_y_bc = insert('year: "') + year_graph_with_3digit + bc_ad + insert('"') + po

final_graph = graph_mdy | graph_md | graph_my | graph_dmy | graph_dm | graph_y | graph_decade | graph_quarter | graph_y_bc
# Half: "first half of twenty twenty two" => H1 2022
half_num = cross("first", "1") | cross("second", "2")
graph_half = (
insert('day: "H') + half_num + insert('"')
+ ds + delete("half") + ds + delete("of") + ds
+ insert(' year: "') + year_graph + insert('"') + po
)

# Century: "nineteen hundreds" => 1900s
graph_century = (
insert('year: "') + (teen | two_digit) + ds + cross("hundreds", "00s") + insert('"') + po
)
# Millennium: "two thousands" => 2000s
graph_millennium = (
insert('year: "') + cross("two", "2") + ds + cross("thousands", "000s") + insert('"') + po
)

final_graph = (
graph_mdy | graph_md | graph_my | graph_dmy | graph_dm | graph_y
| graph_decade | graph_quarter | graph_half | graph_y_bc
| graph_century | graph_millennium
)
self.tagger = self.add_tokens(final_graph)

def build_verbalizer(self):
Expand Down
42 changes: 29 additions & 13 deletions itn/english/rules/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

import pynini
from pynini import closure, cross, invert, string_file
from pynini.lib.pynutil import delete, insert
from pynini.lib.pynutil import add_weight, delete, insert

from itn.english.rules.cardinal import Cardinal
from itn.english.rules.decimal import Decimal
Expand All @@ -35,24 +35,40 @@ def build_tagger(self):
ds = delete(" ")

# Load measurements: symbol\tname, invert to get name -> symbol
units_graph = invert(
string_file(get_abs_path("../itn/english/data/measurements.tsv"))
)
tsv_path = get_abs_path("../itn/english/data/measurements.tsv")
units_graph = invert(string_file(tsv_path))

# Handle plurals: generate plural->symbol mappings from the singular TSV entries
# Uses finite string_map instead of cdrewrite to avoid slow runtime compose
singular_names = {}
with open(tsv_path, encoding="utf-8") as f:
for line in f:
parts = line.strip().split("\t", 1)
if len(parts) == 2:
singular_names.setdefault(parts[1], parts[0])

plural_pairs = []
irregular_plurals = {
"foot": "feet", "inch": "inches",
"ounce": "ounces",
}
for name, symbol in singular_names.items():
if name in irregular_plurals:
plural_pairs.append((irregular_plurals[name], symbol))
elif name.endswith(("s", "z", "sh", "ch", "x")):
plural_pairs.append((name + "es", symbol))
elif name.endswith("y") and len(name) > 1 and name[-2] not in "aeiou":
plural_pairs.append((name[:-1] + "ies", symbol))
else:
plural_pairs.append((name + "s", symbol))

# Handle plurals: strip trailing "s" to match singular form
# e.g. "meters" -> "meter" -> "m", "kilograms" -> "kilogram" -> "kg"
depluralize = pynini.cdrewrite(
cross("s", ""), "", "[EOS]", self.VSIGMA
)
# Handle irregular plurals: "feet" -> "foot"
irregular = pynini.string_map([("feet", "foot")])
unit_singular = units_graph
unit_plural = (depluralize | irregular) @ units_graph
unit_plural = pynini.string_map(plural_pairs)

unit = unit_singular | unit_plural

# Handle "per" units: "per hour" -> "/h"
per_unit = insert("/") + delete("per") + ds + unit_singular
per_unit = add_weight(insert("/") + delete("per") + ds + unit_singular, 1)
full_unit = unit + closure(ds + per_unit, 0, 1) | per_unit

# Cardinal value
Expand Down
11 changes: 9 additions & 2 deletions itn/english/rules/telephone.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,17 @@ def build_tagger(self):
("five","5"),("six","6"),("seven","7"),("eight","8"),
("nine","9"),("zero","0"),("oh","0"),("o","0")]])

# "triple X" => XXX
triple = union(*[cross(f"triple {w}", f"{d}{d}{d}")
for w, d in [("one","1"),("two","2"),("three","3"),("four","4"),
("five","5"),("six","6"),("seven","7"),("eight","8"),
("nine","9"),("zero","0"),("oh","0"),("o","0")]])

# two-digit cardinal: twenty three => 23 (uses graph_two_digit for proper space handling)
two_digit = self.cardinal.graph_two_digit

# a token is 1 or 2 digits
token = single | double | add_weight(two_digit, 0.002)
# a token is 1, 2, or 3 digits
token = single | double | triple | add_weight(two_digit, 0.002)

# sequence of tokens separated by spaces
seq = token + closure(ds + token)
Expand Down Expand Up @@ -75,6 +81,7 @@ def build_tagger(self):
ip_token = (
single + closure(ds + single, 0, 2)
| double
| triple
| add_weight(two_digit, 0.002)
| single + ds + two_digit
| two_digit + ds + single
Expand Down
18 changes: 11 additions & 7 deletions itn/english/rules/time.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
from pynini import closure, cross, invert, string_file, union
from pynini.lib.pynutil import add_weight, delete, insert

TO_OR_TILL = union("to", "till")

from itn.english.rules.cardinal import Cardinal
from tn.processor import Processor
from tn.utils import get_abs_path
Expand Down Expand Up @@ -56,7 +58,7 @@ def build_tagger(self):
min_single_raw = union(*[cross(_num_to_word(x), str(x)) for x in range(1, 10)])
min_double_raw = graph_min_double # already no padding

oclock = cross("o'clock", "") | cross("oclock", "") | cross("hundred hours", "")
oclock = cross("o'clock", "") | cross("o' clock", "") | cross("o clock", "") | cross("oclock", "") | cross("hundred hours", "")

hour = insert('hour: "') + hour_all + insert('"')
hour12 = insert('hour: "') + hour_12 + insert('"')
Expand All @@ -80,23 +82,25 @@ def build_tagger(self):
graph_o_min_suffix = (
hour + ds + insert(' minute: "') + delete("o") + ds + graph_min_single + insert('"') + suffix + zone_opt
)
# "half past two", "quarter past two"
# "half past two", "quarter past two", "ten past four"
graph_past = (
insert('minute: "') + graph_min_verbose + insert('"') + ds + delete("past") + ds + hour
insert('minute: "')
+ (graph_min_single | graph_min_double | graph_min_verbose)
+ insert('"') + ds + delete("past") + ds + hour
)
# "quarter to one" => 12:45
# "quarter to one" / "quarter till one" => 12:45
graph_quarter_to = (
insert('minute: "') + cross("quarter", "45") + insert('"')
+ ds + delete("to") + ds
+ ds + delete(TO_OR_TILL) + ds
+ insert('hour: "') + to_hour + insert('"')
)
# "ten to eleven pm" => 10:50 p.m.
# "ten to eleven pm" / "ten till eleven pm" => 10:50 p.m.
graph_min_to = (
insert('minute: "')
+ ((min_single_raw | min_double_raw) @ minute_to)
+ insert('"')
+ closure(ds + delete("min") + delete("ute").ques + delete("s").ques, 0, 1)
+ ds + delete("to") + ds
+ ds + delete(TO_OR_TILL) + ds
+ insert('hour: "') + to_hour + insert('"')
+ suffix
)
Expand Down
8 changes: 8 additions & 0 deletions itn/english/test/data/en_date.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,11 @@ nineteen seventy five => 1975
eleven fifty five => 1155
second quarter of twenty twenty two => Q2 2022
seven fifty b c => 750BC
seven fifty b c e => 750BCE
nineteen hundred c e => 1900CE
nineteen hundred a d => 1900AD
first half of twenty twenty two => H1 2022
second half of twenty twenty => H2 2020
nineteen hundreds => 1900s
twenty hundreds => 2000s
two thousands => 2000s
2 changes: 2 additions & 0 deletions itn/english/test/data/en_measure.txt
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,5 @@ eight point five megawatts => 8.5 mW
eight point five meters => 8.5 m
eight point five two percent => 8.52 %
eight point four four percent => 8.44 %
five ounces => 5 oz
ten kilo calories => 10 kcal
1 change: 1 addition & 0 deletions itn/english/test/data/en_telephone.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,4 @@ r t x forty fifty t i => RTX 4050ti
four three two double seven three two one four three two one four three double zero five => 432 7732 143214 3005
a thirty six => a 36
a ten eighty p display => a 1080p display
triple five one two three one two three four => 555-123-1234
7 changes: 7 additions & 0 deletions itn/english/test/data/en_time.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,10 @@ quarter to one => 12:45
quarter to twelve => 11:45
set alarm at ten to eleven pm => set alarm at 10:50 p.m.
one min to one am => 12:59 a.m.
ten past four => 04:10
twenty five past three => 03:25
five past twelve => 12:05
quarter till two => 01:45
ten till four pm => 03:50 p.m.
three o' clock => 03:00
three o clock => 03:00
Loading