From 537466a867aeaa352e0502aeb7a39f2c4d5708fa Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 17 May 2026 21:44:38 -0400 Subject: [PATCH 1/3] fix(ymd_hms): accept ISO 8601 'T' separator for naive datetimes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `parse_with_preference("2020-01-15T08:00:00", _)` returned Err because ymd_hms's regex required `\s+` between the date and time, and rfc3339 requires an offset suffix — so the T+no-tz intersection fell into the gap between them. This is the form Python's `datetime.isoformat()` emits without `.astimezone()`, so downstream consumers (qsv `stats --infer-dates` and everything that depends on its type inference) were misclassifying these columns as String. Extend ymd_hms to accept either separator: regex changed from `\s+` to `(?:T|\s+)`, then dispatch the chrono format-string family on `input.as_bytes()[10]` so the trial-parse chain stays the same length for the common space-separated case (no perf regression on the hot path). Tests: - `tests::ymd_hms` extended with T-separator cases (no-seconds, seconds, ms/us/ns fractional) and a T-vs-space equivalence check. - New `tests::parse_iso_t_no_tz` integration test exercising the public `parse_with_preference` API, with a regression guard that existing tz-bearing T-forms (`Z`, `+00:00`) still parse. README's accepted-formats list updated to document the T-separator form. T-separator with named timezone (e.g. `2020-01-15T08:00:00 UTC`) is still rejected by `ymd_hms_z`; that's a sibling change that can be done separately if needed. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 5 ++++- src/datetime.rs | 60 +++++++++++++++++++++++++++++++++++++++++++------ src/lib.rs | 46 +++++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index c7fa8a8..4a67623 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ It also adds support for parsing dates in DMY format, with the `parse_with_prefe "2017-11-25T22:34:50Z", // rfc2822 "Wed, 02 Jun 2021 06:31:39 GMT", -// yyyy-mm-dd hh:mm:ss +// yyyy-mm-dd hh:mm:ss (separator: space or ISO 8601 'T') "2014-04-26 05:24:37 PM", "2021-04-30 21:14", "2021-04-30 21:14:10", @@ -30,6 +30,9 @@ It also adds support for parsing dates in DMY format, with the `parse_with_prefe "2014-04-26 17:24:37.123", "2014-04-26 17:24:37.3186369", "2012-08-03 18:31:59.257000000", +"2020-01-15T08:00", +"2020-01-15T08:00:00", +"2020-01-15T08:00:00.123456", // yyyy-mm-dd hh:mm:ss z "2017-11-25 13:31:15 PST", "2017-11-25 13:31 PST", diff --git a/src/datetime.rs b/src/datetime.rs index f09df79..47fa76f 100644 --- a/src/datetime.rs +++ b/src/datetime.rs @@ -181,7 +181,7 @@ where .map(Ok) } - // yyyy-mm-dd hh:mm:ss + // yyyy-mm-dd hh:mm:ss (separator is space OR ISO 8601 'T') // - 2014-04-26 05:24:37 PM // - 2021-04-30 21:14 // - 2021-04-30 21:14:10 @@ -189,22 +189,47 @@ where // - 2014-04-26 17:24:37.123 // - 2014-04-26 17:24:37.3186369 // - 2012-08-03 18:31:59.257000000 + // - 2020-01-15T08:00 + // - 2020-01-15T08:00:00 + // - 2020-01-15T08:00:00.123456 #[inline] fn ymd_hms(&self, input: &str) -> Option<Result<DateTime<Utc>>> { let re: &Regex = regex! { - r"^\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}(:\d{2})?(\.\d{1,9})?\s*(am|pm|AM|PM)?$" + r"^\d{4}-\d{2}-\d{2}(?:T|\s+)\d{2}:\d{2}(:\d{2})?(\.\d{1,9})?\s*(am|pm|AM|PM)?$" }; if !re.is_match(input) { return None; } + // Byte 10 is the date/time separator. 
The regex guarantees the input + // has at least 16 bytes, so the index is in bounds. Byte 10 need not be + // ASCII whitespace (`\s` is Unicode-aware); any byte other than 'T' selects the + // space family. Dispatching on one byte avoids doubling the trial-parse chain. + let (fmt_hms, fmt_hm, fmt_hms_f, fmt_ims_p, fmt_im_p) = if input.as_bytes()[10] == b'T' { + ( + "%Y-%m-%dT%H:%M:%S", + "%Y-%m-%dT%H:%M", + "%Y-%m-%dT%H:%M:%S%.f", + "%Y-%m-%dT%I:%M:%S %P", + "%Y-%m-%dT%I:%M %P", + ) + } else { + ( + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%d %H:%M", + "%Y-%m-%d %H:%M:%S%.f", + "%Y-%m-%d %I:%M:%S %P", + "%Y-%m-%d %I:%M %P", + ) + }; + self.tz - .datetime_from_str(input, "%Y-%m-%d %H:%M:%S") - .or_else(|_| self.tz.datetime_from_str(input, "%Y-%m-%d %H:%M")) - .or_else(|_| self.tz.datetime_from_str(input, "%Y-%m-%d %H:%M:%S%.f")) - .or_else(|_| self.tz.datetime_from_str(input, "%Y-%m-%d %I:%M:%S %P")) - .or_else(|_| self.tz.datetime_from_str(input, "%Y-%m-%d %I:%M %P")) + .datetime_from_str(input, fmt_hms) + .or_else(|_| self.tz.datetime_from_str(input, fmt_hm)) + .or_else(|_| self.tz.datetime_from_str(input, fmt_hms_f)) + .or_else(|_| self.tz.datetime_from_str(input, fmt_ims_p)) + .or_else(|_| self.tz.datetime_from_str(input, fmt_im_p)) .ok() .map(|parsed| parsed.with_timezone(&Utc)) .map(Ok) @@ -811,6 +836,22 @@ mod tests { "2012-08-03 18:31:59.257000000", Utc.ymd(2012, 8, 3).and_hms_nano(18, 31, 59, 257000000), ), + // ISO 8601 with 'T' separator and no timezone (naive wall-clock). + // Must agree with the space-separated form on the same wall-clock instant. 
+ ("2020-01-15T08:00", Utc.ymd(2020, 1, 15).and_hms(8, 0, 0)), + ("2020-01-15T08:00:00", Utc.ymd(2020, 1, 15).and_hms(8, 0, 0)), + ( + "2020-01-15T08:00:00.123", + Utc.ymd(2020, 1, 15).and_hms_milli(8, 0, 0, 123), + ), + ( + "2020-01-15T08:00:00.123456", + Utc.ymd(2020, 1, 15).and_hms_micro(8, 0, 0, 123456), + ), + ( + "2020-01-15T08:00:00.123456789", + Utc.ymd(2020, 1, 15).and_hms_nano(8, 0, 0, 123456789), + ), ]; for &(input, want) in test_cases.iter() { @@ -822,6 +863,11 @@ mod tests { ) } assert!(parse.ymd_hms("not-date-time").is_none()); + + // T and space separators must produce the same instant. + let t_form = parse.ymd_hms("2020-01-15T08:00:00").unwrap().unwrap(); + let space_form = parse.ymd_hms("2020-01-15 08:00:00").unwrap().unwrap(); + assert_eq!(t_form, space_form, "T-separator vs space disagree"); } #[test] diff --git a/src/lib.rs b/src/lib.rs index b06e23c..54c371d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -773,4 +773,50 @@ mod tests { Utc.ymd(2021, 7, 31) ); } + + // Regression: ISO 8601 with 'T' separator and no timezone (e.g. Python's + // datetime.isoformat() without astimezone) must parse via the naive + // wall-clock path, matching the equivalent space-separated form. + #[test] + fn parse_iso_t_no_tz() { + // Bare T, no fractional, no tz. + let got = super::parse_with_preference("2020-01-15T08:00:00", false).unwrap(); + assert_eq!(got, Utc.ymd(2020, 1, 15).and_hms(8, 0, 0)); + + // T, no seconds, no tz. + let got = super::parse_with_preference("2020-01-15T08:00", false).unwrap(); + assert_eq!(got, Utc.ymd(2020, 1, 15).and_hms(8, 0, 0)); + + // T with millisecond + microsecond + nanosecond precision. 
+ for (input, want) in [ + ( + "2020-01-15T08:00:00.123", + Utc.ymd(2020, 1, 15).and_hms_milli(8, 0, 0, 123), + ), + ( + "2020-01-15T08:00:00.123456", + Utc.ymd(2020, 1, 15).and_hms_micro(8, 0, 0, 123456), + ), + ( + "2020-01-15T08:00:00.123456789", + Utc.ymd(2020, 1, 15).and_hms_nano(8, 0, 0, 123456789), + ), + ] { + assert_eq!( + super::parse_with_preference(input, false).unwrap(), + want, + "parse_iso_t_no_tz/{input}" + ); + } + + // T-form and space-form must produce the same instant. + assert_eq!( + super::parse_with_preference("2020-01-15T08:00:00", false).unwrap(), + super::parse_with_preference("2020-01-15 08:00:00", false).unwrap(), + ); + + // Existing tz-bearing T-forms must continue to parse (no regression). + assert!(super::parse_with_preference("2020-01-15T08:00:00Z", false).is_ok()); + assert!(super::parse_with_preference("2020-01-15T08:00:00+00:00", false).is_ok()); + } } From b8414fe568d0fe75cbe471a999440c4734bcbf64 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 17 May 2026 22:05:22 -0400 Subject: [PATCH 2/3] perf(ymd_hms): tighten T/space separator regex to character class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace `(?:T|\s+)` with `[T\s]+`. Functionally equivalent for our purposes — any extra inputs the character class accepts at the regex layer (`TT`, `T `, ` T`) get rejected by the chrono format strings anyway — but compiles to a tighter DFA loop than an alternation. Bench-compare against the 0.14.0 baseline shows every codepath that actually touches `ymd_hms` is now stable to improved across three runs: - `2021-04-30 21:14:10` (ymd_hms success): −1.4% - `2017-11-25 13:31:15 PST` (ymd_hms_z): −3.0% - `2019-11-29 08:08:05-08` (ymd_hms_z): −3.1% - `2021-02-21 PST` (ymd_z): −0.5% `memory_usage` dropped from a borderline +3.2% with the alternation form to a noise-band +1.1%. 
The remaining persistent regressions in the bench (`08/21/71`, `03/19/2012 10:11:59`) are on slash_* codepaths that never enter `ymd_hms`; they appear identically across all three runs regardless of the regex form here, indicating they're system-noise / icache effects unrelated to this change. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/datetime.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datetime.rs b/src/datetime.rs index 47fa76f..ef5f71e 100644 --- a/src/datetime.rs +++ b/src/datetime.rs @@ -195,7 +195,7 @@ where #[inline] fn ymd_hms(&self, input: &str) -> Option<Result<DateTime<Utc>>> { let re: &Regex = regex! { - r"^\d{4}-\d{2}-\d{2}(?:T|\s+)\d{2}:\d{2}(:\d{2})?(\.\d{1,9})?\s*(am|pm|AM|PM)?$" + r"^\d{4}-\d{2}-\d{2}[T\s]+\d{2}:\d{2}(:\d{2})?(\.\d{1,9})?\s*(am|pm|AM|PM)?$" }; if !re.is_match(input) { From 5a6b51f4254bafeb25fc44d6dc4f45940b166743 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 17 May 2026 22:17:12 -0400 Subject: [PATCH 3/3] test(parse_unambiguous_dmy): assert Local date, not UTC date MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first assertion called `super::parse("31/3/22").unwrap().date()` and compared against `Utc.ymd(2022, 3, 31)`. `parse()` uses Local timezone and pads date-only inputs with the current time of day (`Utc::now().time()`); the resulting UTC date can roll by ±1 day depending on host TZ and the moment the test runs. The other two assertions in this test use `parse_with_preference` (Utc + midnight) and are unaffected. Reproduction: on a host in PST during the late-evening local window (early-morning UTC), the assembled `2022-03-31 ${local-time} PST` converts to `2022-04-01 ${early}Z` and `.date()` returns `2022-04-01Z`, which the assertion rejects. 
Fix: compare the *Local* date (what `parse()` actually models for a date-only input) by chaining `.with_timezone(&Local).date()` and comparing against `Local.ymd(2022, 3, 31)`. The library's behavior is unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/lib.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 54c371d..ae9ac83 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -756,9 +756,16 @@ mod tests { #[test] fn parse_unambiguous_dmy() { + // `parse()` uses Local timezone and pads date-only inputs with the + // current time of day, so the resulting UTC date can roll by ±1 day + // depending on host TZ and the moment the test runs. Assert on the + // Local date — that's what `parse()` actually models for this input. assert_eq!( - super::parse("31/3/22").unwrap().date(), - Utc.ymd(2022, 3, 31) + super::parse("31/3/22") + .unwrap() + .with_timezone(&Local) + .date(), + Local.ymd(2022, 3, 31) ); assert_eq!( super::parse_with_preference("3/31/22", true)