From eeb083d3479806c53e0b5525f5dbbf64adb093e5 Mon Sep 17 00:00:00 2001 From: Celso Pinto Date: Tue, 14 Apr 2026 20:05:16 +0800 Subject: [PATCH] Populate sender email and recipients in threads output `hey threads --json` was returning empty `creator.email_address` and `recipients` for every entry because the HTML parser only captured the sender's display name and ignored the rest of the sender link. Scrape the sender email from the `entry__sender-email` element inside each sender anchor, and extract per-entry recipients by slicing the HTML between entry anchors and reusing the existing `fullRecipientsRe` + `extractEmails` helpers. Dedupe recipients by address so a repeat in the HTML doesn't produce duplicate contacts. Co-Authored-By: Claude Opus 4.6 (1M context) --- internal/htmlutil/entries.go | 41 +++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/internal/htmlutil/entries.go b/internal/htmlutil/entries.go index 70b3a4b..d0001b0 100644 --- a/internal/htmlutil/entries.go +++ b/internal/htmlutil/entries.go @@ -12,6 +12,7 @@ import ( var ( entryBlockRe = regexp.MustCompile(`(?s)data-entry-id="(\d+)"`) senderRe = regexp.MustCompile(`id="sender_entry_(\d+)"[^>]*>\s*([^<]+?)\s*<`) + senderEmailRe = regexp.MustCompile(`(?s)sender_entry_(\d+).*?entry__sender-email[^>]*>]*>[^<]*([^<]+)<`) timeRe = regexp.MustCompile(`]*datetime="([^"]+)"`) srcdocRe = regexp.MustCompile(`(?s)srcdoc="([^"]*trix-content[^"]*)"`) fullRecipientsRe = regexp.MustCompile(`(?s)entry__full-recipients[^>]*>(.*?)`) @@ -86,6 +87,12 @@ func ParseTopicEntriesHTML(html string) []models.Entry { senders[m[1]] = m[2] } } + senderEmails := map[string]string{} + for _, m := range senderEmailRe.FindAllStringSubmatch(html, -1) { + if _, exists := senderEmails[m[1]]; !exists { + senderEmails[m[1]] = strings.TrimSpace(m[2]) + } + } // Associate times with entries by finding the first