From 37008b1dd6ee5b8d273afb7c6b85764bc1f0590a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 28 Feb 2026 07:32:53 +0000 Subject: [PATCH 1/8] Add schema.org microdata support to HtmlProvider (closes #611) - Parse itemscope/itemtype/itemprop HTML microdata attributes at design time - Generate a typed 'Schemas' container on HtmlProvider documents - Each schema type (e.g. http://schema.org/Person) becomes a property returning an array of typed items with one property per itemprop name - Items are erased to HtmlSchemaItem at runtime - Property values follow the HTML microdata spec: content attr, href, src, datetime, or inner text depending on element type - Nested itemscope elements are not traversed (correct per spec) - 6 unit tests (HtmlRuntime.getSchemas) + 3 integration tests (HtmlProvider) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- RELEASE_NOTES.md | 1 + .../Html/HtmlGenerator.fs | 57 +++++++ src/FSharp.Data.Html.Core/HtmlRuntime.fs | 154 +++++++++++++++++- .../HtmlRuntimeTypes.fs | 102 +++++++++++- tests/FSharp.Data.Tests/HtmlProvider.fs | 40 +++++ 5 files changed, 350 insertions(+), 4 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 334a65517..65d297cdf 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -2,6 +2,7 @@ ## 8.1.0-beta +- Add schema.org microdata support to `HtmlProvider`: when an HTML document contains elements with `itemscope`/`itemtype`/`itemprop` attributes, the provider now generates a typed `Schemas` container (e.g. 
`doc.Schemas.Person`) with one strongly-typed property per `itemprop` name discovered in the sample (closes #611) - Add `Http.ParseLinkHeader` utility for parsing RFC 5988 `Link` response headers (used by GitHub, GitLab, and other paginated APIs) into a `Map` from relation name to URL (closes #805) - Add `PreferDateTimeOffset` parameter to `CsvProvider`, `JsonProvider`, and `XmlProvider`: when true, date-time values without an explicit timezone offset are inferred as `DateTimeOffset` (using local offset) instead of `DateTime` (closes #1100, #1072) - Make `Http.AppendQueryToUrl` public (closes #1325) diff --git a/src/FSharp.Data.DesignTime/Html/HtmlGenerator.fs b/src/FSharp.Data.DesignTime/Html/HtmlGenerator.fs index f3362b694..436523161 100644 --- a/src/FSharp.Data.DesignTime/Html/HtmlGenerator.fs +++ b/src/FSharp.Data.DesignTime/Html/HtmlGenerator.fs @@ -301,6 +301,48 @@ module internal HtmlGenerator = definitionListType + let private createSchemaItemType getSchemaTypeName (schemaGroup: HtmlSchemaGroup) = + + // Each item is erased as HtmlSchemaItem, which carries the property map. + let itemType = + ProvidedTypeDefinition( + getSchemaTypeName schemaGroup.Name, + Some typeof, + hideObjectMethods = true, + nonNullable = true + ) + + itemType.AddXmlDoc(sprintf "Represents a single schema.org/%s microdata item." schemaGroup.Name) + + for propName in schemaGroup.Properties do + let keyCapture = propName + + let prop = + ProvidedProperty( + NameUtils.nicePascalName propName, + typeof, + getterCode = + (fun (Singleton item) -> + <@@ + (%%item: HtmlSchemaItem).Properties + |> Map.tryFind keyCapture + |> Option.defaultValue "" + @@>) + ) + + prop.AddXmlDoc(sprintf "Gets the '%s' itemprop value." 
propName) + itemType.AddMember prop + + itemType.AddMember( + ProvidedProperty( + "Html", + typeof, + getterCode = fun (Singleton item) -> <@@ (%%item: HtmlSchemaItem).Html @@> + ) + ) + + itemType + let generateTypes asm ns typeName parameters supportsNet6Types htmlObjects = let htmlType = @@ -379,4 +421,19 @@ module internal HtmlGenerator = getterCode = fun (Singleton doc) -> doc ) + | SchemaGroup schemaGroup -> + let containerType = getOrCreateContainer "Schemas" + let nameCapture = schemaGroup.Name + let itemType = createSchemaItemType getTypeName schemaGroup + + htmlType.AddMember itemType + + // Expose as a property returning an array of item types (erased as HtmlSchemaItem[]) + containerType.AddMember + <| ProvidedProperty( + getPropertyName schemaGroup.Name, + itemType.MakeArrayType(), + getterCode = fun (Singleton doc) -> <@@ (%%doc: HtmlDocument).GetSchema(nameCapture).Items @@> + ) + htmlType diff --git a/src/FSharp.Data.Html.Core/HtmlRuntime.fs b/src/FSharp.Data.Html.Core/HtmlRuntime.fs index 751cd46d8..efd006fef 100644 --- a/src/FSharp.Data.Html.Core/HtmlRuntime.fs +++ b/src/FSharp.Data.Html.Core/HtmlRuntime.fs @@ -109,17 +109,47 @@ type HtmlDefinitionList = sb.ToString() +/// Representation of a single schema.org microdata item (an element with itemscope/itemtype) +type HtmlSchemaItem = + { Properties: Map + Html: HtmlNode } + +/// Representation of a collection of schema.org microdata items sharing the same type URL +type HtmlSchemaGroup = + { + /// The local name from the schema type URL (e.g. "Person" from "http://schema.org/Person") + Name: string + /// The full schema type URL (e.g. 
"http://schema.org/Person") + TypeUrl: string + /// All item instances found in the document for this schema type + Items: HtmlSchemaItem[] + /// All property names discovered across items (union of keys), for type generation + Properties: string[] + } + + override x.ToString() = + let sb = System.Text.StringBuilder() + sb.AppendLine(sprintf "%s (%d items)" x.Name x.Items.Length) |> ignore + + for item in x.Items do + for (k, v) in item.Properties |> Map.toSeq do + sb.AppendLine(sprintf " %s = %s" k v) |> ignore + + sb.ToString() + /// Representation of an HTML table, list, or definition list type HtmlObjectDescription = | Table of HtmlTable | List of HtmlList | DefinitionList of HtmlDefinitionList + | SchemaGroup of HtmlSchemaGroup member x.Name = match x with | Table t -> t.Name | List l -> l.Name | DefinitionList dl -> dl.Name + | SchemaGroup sg -> sg.Name // -------------------------------------------------------------------------------------- @@ -396,6 +426,112 @@ module HtmlRuntime = Html = definitionList } |> Some + + let private hasAttr (name: string) (n: HtmlNode) = n.TryGetAttribute name |> Option.isSome + + /// Extract the property value for a schema.org microdata itemprop element. + /// Follows the HTML microdata specification: uses content attr, href, src, or inner text. 
+    let private getMicrodataValue (node: HtmlNode) =
+        // Value of the named attribute on this element, if it is present.
+        let attr (name: string) =
+            node.TryGetAttribute name |> Option.map HtmlAttribute.value
+
+        // Whitespace-normalised text content of the element.
+        let text () = normalizeWs (node.InnerText())
+
+        // Lenient fallback: honour a `content` attribute on any element
+        // (common in real-world markup), otherwise use the text content.
+        let fallback () = attr "content" |> Option.defaultWith text
+
+        match node with
+        // <meta> carries its value only in `content`; absent means empty.
+        | HtmlElement("meta", _, _) -> attr "content" |> Option.defaultValue ""
+        // Link-like elements carry their value in `href`. The text fallback is
+        // normalised so it is consistent with every other text-derived value.
+        | HtmlElement(("a" | "area" | "link"), _, _) -> attr "href" |> Option.defaultWith text
+        // Embedded-content elements carry their value in `src`; absent means empty.
+        | HtmlElement(("audio" | "embed" | "iframe" | "img" | "source" | "track" | "video"), _, _) ->
+            attr "src" |> Option.defaultValue ""
+        // <object> carries its value in `data`.
+        | HtmlElement("object", _, _) -> attr "data" |> Option.defaultWith fallback
+        // <data> and <meter> carry a machine-readable `value` attribute.
+        | HtmlElement(("data" | "meter"), _, _) -> attr "value" |> Option.defaultWith fallback
+        // <time> prefers `datetime`, otherwise its text content.
+        | HtmlElement("time", _, _) -> attr "datetime" |> Option.defaultWith text
+        | _ -> fallback ()
+    /// Parse a single itemscope element into an HtmlSchemaItem.
+    /// Extracts all direct (non-nested) itemprop values.
+    let private parseSchemaItem (node: HtmlNode) : HtmlSchemaItem =
+        // `itemprop` is a space-separated set of names, so one element may
+        // contribute its value under several property names. An empty or
+        // whitespace-only attribute contributes no names at all.
+        let propNames (n: HtmlNode) =
+            match n.TryGetAttribute "itemprop" with
+            | Some attr ->
+                let raw = HtmlAttribute.value attr
+
+                raw.Split([| ' '; '\t'; '\n'; '\r'; '\f' |], StringSplitOptions.RemoveEmptyEntries)
+                |> List.ofArray
+            | None -> []
+
+        // Walks the subtree in document order, accumulating (name, value)
+        // pairs in reverse; the caller reverses once at the end.
+        let rec collectProps acc (n: HtmlNode) : (string * string) list =
+            match n with
+            | HtmlElement(_, _, children) ->
+                let acc' =
+                    match propNames n with
+                    | [] -> acc
+                    | names ->
+                        let value = getMicrodataValue n
+                        names |> List.fold (fun a name -> (name, value) :: a) acc
+
+                // A nested itemscope starts a separate item: record the property
+                // it is attached to (if any) but do not descend into it.
+                if hasAttr "itemscope" n then
+                    acc'
+                else
+                    List.fold collectProps acc' children
+            | _ -> acc
+
+        // Only descendants are scanned; an itemprop on the root element itself
+        // belongs to an enclosing item, not to this one.
+        let children =
+            match node with
+            | HtmlElement(_, _, cs) -> cs
+            | _ -> []
+
+        let props = List.fold collectProps [] children |> List.rev
+
+        // When a property name repeats, the first occurrence in document order wins.
+        let propMap =
+            props
+            |> List.fold (fun (m: Map<string, string>) (k, v) -> if m.ContainsKey k then m else m.Add(k, v)) Map.empty
+
+        { Properties = propMap; Html = node }
+
+    /// Get the local name from a schema type URL, e.g. "Person" from "http://schema.org/Person"
+    let private schemaTypeName (typeUrl: string) =
+        // Take the last path or fragment segment, ignoring any trailing slash.
+        let last = typeUrl.TrimEnd('/').Split([| '/'; '#' |]) |> Array.last
+
+        // Fall back to a generic name when the URL has no usable final segment.
+        if String.IsNullOrWhiteSpace last then
+            "Schema"
+        else
+            NameUtils.nicePascalName last
+
+    /// Extract all schema.org microdata groups from the document,
+    /// grouped by itemtype URL.
+ let getSchemas (doc: HtmlDocument) : HtmlSchemaGroup list = + let makeUnique = NameUtils.uniqueGenerator id + + doc.Descendants((fun n -> hasAttr "itemscope" n && hasAttr "itemtype" n), false) + |> Seq.toList + |> List.groupBy (fun n -> + n.TryGetAttribute "itemtype" + |> Option.map HtmlAttribute.value + |> Option.defaultValue "") + |> List.filter (fun (typeUrl, _) -> typeUrl <> "") + |> List.map (fun (typeUrl, nodes) -> + let name = makeUnique (schemaTypeName typeUrl) + let items = nodes |> List.map parseSchemaItem |> Array.ofList + + let allProps = + items + |> Array.collect (fun item -> item.Properties |> Map.toArray |> Array.map fst) + |> Array.distinct + + { Name = name + TypeUrl = typeUrl + Items = items + Properties = allProps }) + let internal getTables inferenceParameters includeLayoutTables (doc: HtmlDocument) = let tableElements = doc.DescendantsWithPath "table" |> List.ofSeq @@ -429,7 +565,8 @@ module HtmlRuntime = Seq.concat [ doc |> getTables inferenceParameters includeLayoutTables |> List.map Table doc |> getLists |> List.map List - doc |> getDefinitionLists |> List.map DefinitionList ] + doc |> getDefinitionLists |> List.map DefinitionList + doc |> getSchemas |> List.map SchemaGroup ] // -------------------------------------------------------------------------------------- @@ -442,7 +579,7 @@ open FSharp.Data open FSharp.Data.Runtime /// Underlying representation of the root types generated by HtmlProvider -type HtmlDocument internal (doc, tables, lists, definitionLists) = +type HtmlDocument internal (doc, tables, lists, definitionLists, schemas) = member _.Html = doc @@ -470,7 +607,10 @@ type HtmlDocument internal (doc, tables, lists, definitionLists) = |> List.map (fun e -> e.Name, e) |> Map.ofList - HtmlDocument(doc, tables, lists, definitionLists) + let schemas = + doc |> HtmlRuntime.getSchemas |> List.map (fun e -> e.Name, e) |> Map.ofList + + HtmlDocument(doc, tables, lists, definitionLists, schemas) /// [] @@ -496,6 +636,14 @@ type 
HtmlDocument internal (doc, tables, lists, definitionLists) = IsError = false)>] member _.GetDefinitionList(id: string) = definitionLists |> Map.find id + /// + [] + [] + member _.GetSchema(id: string) = schemas |> Map.find id + /// Underlying representation of table types generated by HtmlProvider type HtmlTable<'RowType> internal (name: string, headers: string[] option, values: 'RowType[], html: HtmlNode) = diff --git a/tests/FSharp.Data.Core.Tests/HtmlRuntimeTypes.fs b/tests/FSharp.Data.Core.Tests/HtmlRuntimeTypes.fs index e7262923f..4a7e907df 100644 --- a/tests/FSharp.Data.Core.Tests/HtmlRuntimeTypes.fs +++ b/tests/FSharp.Data.Core.Tests/HtmlRuntimeTypes.fs @@ -98,4 +98,104 @@ let ``HtmlDefinitionList record ToString should handle empty definitions`` () = result |> should contain name // Note: Generic HtmlList and HtmlTable tests are complex due to internal type usage -// The record types (HtmlList and HtmlDefinitionList) above provide good coverage for the 0% areas \ No newline at end of file +// The record types (HtmlList and HtmlDefinitionList) above provide good coverage for the 0% areas +// ============================================ +// Schema.org Microdata Tests +// ============================================ + +[] +let ``getSchemas returns empty list when no microdata present`` () = + let doc = HtmlDocument.Parse "

No microdata here.

" + let schemas = HtmlRuntime.getSchemas doc + schemas |> List.length |> should equal 0 + +[] +let ``getSchemas finds itemscope items and groups by itemtype`` () = + let html = """ + +
+ Jane Doe + Engineer +
+""" + + let doc = HtmlDocument.Parse html + let schemas = HtmlRuntime.getSchemas doc + schemas |> should haveLength 1 + let group = schemas.[0] + group.TypeUrl |> should equal "http://schema.org/Person" + group.Name |> should equal "Person" + group.Items |> should haveLength 1 + group.Items.[0].Properties |> Map.find "name" |> should equal "Jane Doe" + group.Items.[0].Properties |> Map.find "jobTitle" |> should equal "Engineer" + +[] +let ``getSchemas groups multiple items of same type`` () = + let html = """ + +
+ Widget A + 9.99 +
+
+ Widget B +
+""" + + let doc = HtmlDocument.Parse html + let schemas = HtmlRuntime.getSchemas doc + schemas |> should haveLength 1 + let group = schemas.[0] + group.Items |> should haveLength 2 + group.Items.[0].Properties |> Map.find "name" |> should equal "Widget A" + group.Items.[1].Properties |> Map.find "name" |> should equal "Widget B" + group.Properties |> should contain "name" + group.Properties |> should contain "price" + +[] +let ``getSchemas creates separate groups for different schema types`` () = + let html = """ + +
+ John +
+
+ Acme Corp +
+""" + + let doc = HtmlDocument.Parse html + let schemas = HtmlRuntime.getSchemas doc + schemas |> should haveLength 2 + +[] +let ``getSchemas uses content attribute for meta elements`` () = + let html = """ + +
+ +
+""" + + let doc = HtmlDocument.Parse html + let schemas = HtmlRuntime.getSchemas doc + schemas.[0].Items.[0].Properties |> Map.find "startDate" |> should equal "2024-01-15" + +[] +let ``HtmlSchemaGroup ToString formats correctly`` () = + let node = HtmlNode.NewText "dummy" + + let item = + { HtmlSchemaItem.Properties = Map.ofList [ "name", "Alice" ] + Html = node } + + let group = + { HtmlSchemaGroup.Name = "Person" + TypeUrl = "http://schema.org/Person" + Items = [| item |] + Properties = [| "name" |] } + + let result = group.ToString() + result |> should contain "Person" + result |> should contain "name" + result |> should contain "Alice" diff --git a/tests/FSharp.Data.Tests/HtmlProvider.fs b/tests/FSharp.Data.Tests/HtmlProvider.fs index 95b33ee7d..b8b145542 100644 --- a/tests/FSharp.Data.Tests/HtmlProvider.fs +++ b/tests/FSharp.Data.Tests/HtmlProvider.fs @@ -372,3 +372,43 @@ let ``Can infer DateTime and DateTimeOffset types correctly`` () = table.Rows.[0].DateOnly.GetType() |> should equal typeof table.Rows.[0].MixedDate.GetType() |> should equal typeof table.Rows.[0].DateWithOffset.GetType() |> should equal typeof + +// ============================================ +// Schema.org Microdata via HtmlProvider +// ============================================ + +[] +let schemaMicrodata = """ +
+ Jane Smith + Software Engineer + +
+
+ Bob Jones + Designer +
+""" + +type SchemaMicrodataHtml = HtmlProvider + +[] +let ``HtmlProvider exposes Schemas container for microdata`` () = + let doc = SchemaMicrodataHtml.GetSample() + let people = doc.Schemas.Person + people |> should not' (be null) + +[] +let ``HtmlProvider Schemas.Person returns all items`` () = + let doc = SchemaMicrodataHtml.GetSample() + let people = doc.Schemas.Person + people |> should haveLength 2 + +[] +let ``HtmlProvider Schemas.Person items have typed properties`` () = + let doc = SchemaMicrodataHtml.GetSample() + let people = doc.Schemas.Person + people.[0].Name |> should equal "Jane Smith" + people.[0].JobTitle |> should equal "Software Engineer" + people.[0].Url |> should equal "https://example.com" + people.[1].Name |> should equal "Bob Jones" From 240c687b82fe73593582dc9b2effeadfdff46b95 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 28 Feb 2026 07:35:46 +0000 Subject: [PATCH 2/8] ci: trigger CI checks From 753081b7b8589611bad58d43b48051dd7ba81e20 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 28 Feb 2026 12:22:11 +0000 Subject: [PATCH 3/8] Update DesignTime snapshot tests for HTML files with microdata The HtmlProvider now generates schema types for HTML elements with itemscope/itemtype/itemprop attributes. Update the expected signature snapshots for the 4 HTML test files that contain microdata (zoopla.html, zoopla2.html, ebay_cars.htm, imdb_chart.htm) so the DesignTime tests pass. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Html,ebay_cars.htm,False,False,.expected | 17 +++++++++++++ .../Html,imdb_chart.htm,False,False,.expected | 25 +++++++++++++++++++ .../Html,zoopla.html,False,False,.expected | 21 ++++++++++++++++ .../Html,zoopla2.html,False,False,.expected | 13 ++++++++++ 4 files changed, 76 insertions(+) diff --git a/tests/FSharp.Data.DesignTime.Tests/expected/Html,ebay_cars.htm,False,False,.expected b/tests/FSharp.Data.DesignTime.Tests/expected/Html,ebay_cars.htm,False,False,.expected index cbfaf92ff..c5604c4e8 100644 --- a/tests/FSharp.Data.DesignTime.Tests/expected/Html,ebay_cars.htm,False,False,.expected +++ b/tests/FSharp.Data.DesignTime.Tests/expected/Html,ebay_cars.htm,False,False,.expected @@ -28,6 +28,9 @@ class HtmlProvider : FDR.BaseTypes.HtmlDocument static member Parse: text:string -> HtmlProvider HtmlDocument.Create(false, ((new StringReader(text)) :> TextReader)) + member Schemas: HtmlProvider+SchemasContainer with get + this + member Tables: HtmlProvider+TablesContainer with get this @@ -94,6 +97,11 @@ class HtmlProvider+ListsContainer : FDR.BaseTypes.HtmlDocument HtmlList<_>.Create(rowConverter, this, "See also") +class HtmlProvider+SchemasContainer : FDR.BaseTypes.HtmlDocument + member SearchResultsPage: HtmlProvider+HtmlProvider+SearchResultsPage[] with get + this.GetSchema("SearchResultsPage").Items + + class HtmlProvider+TablesContainer : FDR.BaseTypes.HtmlDocument member ``Additional site navigation``: HtmlProvider+AdditionalSiteNavigation with get let rowConverter = new Func<_,_>(fun (row:string[]) -> @@ -124,6 +132,15 @@ class HtmlProvider+Menu4 : FDR.BaseTypes.HtmlList class HtmlProvider+Menu5 : FDR.BaseTypes.HtmlList +class HtmlProvider+SearchResultsPage : FDR.HtmlSchemaItem + member Html: HtmlNode with get + this.Html + + member Offers: string with get + this.Properties |> (MapModule.TryFind "offers" table) |> let value = "" + Lambda (option, Call (None, DefaultValue, [value, 
option])) + + class HtmlProvider+SeeAlso : FDR.BaseTypes.HtmlList class HtmlProvider+AdditionalSiteNavigation+Row : string * string diff --git a/tests/FSharp.Data.DesignTime.Tests/expected/Html,imdb_chart.htm,False,False,.expected b/tests/FSharp.Data.DesignTime.Tests/expected/Html,imdb_chart.htm,False,False,.expected index ed0343fde..dc8cf435d 100644 --- a/tests/FSharp.Data.DesignTime.Tests/expected/Html,imdb_chart.htm,False,False,.expected +++ b/tests/FSharp.Data.DesignTime.Tests/expected/Html,imdb_chart.htm,False,False,.expected @@ -28,6 +28,9 @@ class HtmlProvider : FDR.BaseTypes.HtmlDocument static member Parse: text:string -> HtmlProvider HtmlDocument.Create(false, ((new StringReader(text)) :> TextReader)) + member Schemas: HtmlProvider+SchemasContainer with get + this + member Tables: HtmlProvider+TablesContainer with get this @@ -70,6 +73,11 @@ class HtmlProvider+ListsContainer : FDR.BaseTypes.HtmlDocument HtmlList<_>.Create(rowConverter, this, "YOUR WATCHLIST") +class HtmlProvider+SchemasContainer : FDR.BaseTypes.HtmlDocument + member AggregateRating: HtmlProvider+HtmlProvider+AggregateRating[] with get + this.GetSchema("AggregateRating").Items + + class HtmlProvider+TablesContainer : FDR.BaseTypes.HtmlDocument member ``Take The Quiz!``: HtmlProvider+TakeTheQuiz with get let rowConverter = new Func<_,_>(fun (row:string[]) -> @@ -106,6 +114,23 @@ class HtmlProvider+TablesContainer : FDR.BaseTypes.HtmlDocument HtmlTable<_>.Create(rowConverter, this, "Top 250", true) +class HtmlProvider+AggregateRating : FDR.HtmlSchemaItem + member BestRating: string with get + this.Properties |> (MapModule.TryFind "bestRating" table) |> let value = "" + Lambda (option, Call (None, DefaultValue, [value, option])) + + member Html: HtmlNode with get + this.Html + + member RatingCount: string with get + this.Properties |> (MapModule.TryFind "ratingCount" table) |> let value = "" + Lambda (option, Call (None, DefaultValue, [value, option])) + + member RatingValue: string with get 
+ this.Properties |> (MapModule.TryFind "ratingValue" table) |> let value = "" + Lambda (option, Call (None, DefaultValue, [value, option])) + + class HtmlProvider+ConsumerMainNav : FDR.BaseTypes.HtmlList class HtmlProvider+GetDiscovered : FDR.BaseTypes.HtmlList diff --git a/tests/FSharp.Data.DesignTime.Tests/expected/Html,zoopla.html,False,False,.expected b/tests/FSharp.Data.DesignTime.Tests/expected/Html,zoopla.html,False,False,.expected index 8db4418d1..4f1c3a30f 100644 --- a/tests/FSharp.Data.DesignTime.Tests/expected/Html,zoopla.html,False,False,.expected +++ b/tests/FSharp.Data.DesignTime.Tests/expected/Html,zoopla.html,False,False,.expected @@ -28,6 +28,9 @@ class HtmlProvider : FDR.BaseTypes.HtmlDocument static member Parse: text:string -> HtmlProvider HtmlDocument.Create(false, ((new StringReader(text)) :> TextReader)) + member Schemas: HtmlProvider+SchemasContainer with get + this + member Tables: HtmlProvider+TablesContainer with get this @@ -130,6 +133,11 @@ class HtmlProvider+ListsContainer : FDR.BaseTypes.HtmlDocument HtmlList<_>.Create(rowConverter, this, "Property type") +class HtmlProvider+SchemasContainer : FDR.BaseTypes.HtmlDocument + member WebPage: HtmlProvider+HtmlProvider+WebPage[] with get + this.GetSchema("WebPage").Items + + class HtmlProvider+TablesContainer : FDR.BaseTypes.HtmlDocument member ``Currently available to rent in London``: HtmlProvider+CurrentlyAvailableToRentInLondon with get let rowConverter = new Func<_,_>(fun (row:string[]) -> @@ -180,6 +188,19 @@ class HtmlProvider+PropertyFeatures : FDR.BaseTypes.HtmlList class HtmlProvider+PropertyType : FDR.BaseTypes.HtmlList +class HtmlProvider+WebPage : FDR.HtmlSchemaItem + member Breadcrumb: string with get + this.Properties |> (MapModule.TryFind "breadcrumb" table) |> let value = "" + Lambda (option, Call (None, DefaultValue, [value, option])) + + member Html: HtmlNode with get + this.Html + + member Name: string with get + this.Properties |> (MapModule.TryFind "name" table) |> 
let value = "" + Lambda (option, Call (None, DefaultValue, [value, option])) + + class HtmlProvider+CurrentlyAvailableToRentInLondon+Row : string * int * int * int * float member ``1 bed``: int with get (let _,t2,_,_,_ = this in t2) diff --git a/tests/FSharp.Data.DesignTime.Tests/expected/Html,zoopla2.html,False,False,.expected b/tests/FSharp.Data.DesignTime.Tests/expected/Html,zoopla2.html,False,False,.expected index 47eb0fa4d..d88cc1779 100644 --- a/tests/FSharp.Data.DesignTime.Tests/expected/Html,zoopla2.html,False,False,.expected +++ b/tests/FSharp.Data.DesignTime.Tests/expected/Html,zoopla2.html,False,False,.expected @@ -28,6 +28,9 @@ class HtmlProvider : FDR.BaseTypes.HtmlDocument static member Parse: text:string -> HtmlProvider HtmlDocument.Create(false, ((new StringReader(text)) :> TextReader)) + member Schemas: HtmlProvider+SchemasContainer with get + this + member Tables: HtmlProvider+TablesContainer with get this @@ -130,6 +133,11 @@ class HtmlProvider+ListsContainer : FDR.BaseTypes.HtmlDocument HtmlList<_>.Create(rowConverter, this, "Property type") +class HtmlProvider+SchemasContainer : FDR.BaseTypes.HtmlDocument + member SearchResultsPage: HtmlProvider+HtmlProvider+SearchResultsPage[] with get + this.GetSchema("SearchResultsPage").Items + + class HtmlProvider+TablesContainer : FDR.BaseTypes.HtmlDocument member ``Currently available to rent in London``: HtmlProvider+CurrentlyAvailableToRentInLondon with get let rowConverter = new Func<_,_>(fun (row:string[]) -> @@ -180,6 +188,11 @@ class HtmlProvider+List9 : FDR.BaseTypes.HtmlList class HtmlProvider+PropertyType : FDR.BaseTypes.HtmlList +class HtmlProvider+SearchResultsPage : FDR.HtmlSchemaItem + member Html: HtmlNode with get + this.Html + + class HtmlProvider+CurrentlyAvailableToRentInLondon+Row : string * int * int * int * float member ``1 bed``: int with get (let _,t2,_,_,_ = this in t2) From 69c35901190d7d74b28525f64e82a4ea76c6dfa5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 28 
Feb 2026 12:23:33 +0000 Subject: [PATCH 4/8] ci: trigger CI checks From 9da8e6c94586b224c8a57554741d61f2786b343a Mon Sep 17 00:00:00 2001 From: Repo Assist Date: Sat, 28 Feb 2026 13:08:21 +0000 Subject: [PATCH 5/8] docs: improve HtmlProvider documentation with S&P 500 example and schema.org microdata section - Update introduction to clearly explain table naming, column type inference, and when to use the provider - Add Wikipedia S&P 500 companies example demonstrating groupBy analysis - Add schema.org microdata section showing ProductCatalog and mixed-page samples - Update NuGet stats example with improved regex and formatting - Rename Doctor Who groupBy variable for clarity - Add note about JSON-LD on JS-rendered sites (IMDB, eBay) - Remove outdated 'Introducing the provider' framing Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/library/HtmlProvider.fsx | 227 ++++++++++++++++++++++++++-------- 1 file changed, 178 insertions(+), 49 deletions(-) diff --git a/docs/library/HtmlProvider.fsx b/docs/library/HtmlProvider.fsx index 4daae4049..df26a5f00 100644 --- a/docs/library/HtmlProvider.fsx +++ b/docs/library/HtmlProvider.fsx @@ -28,28 +28,41 @@ Formatter.Register(fun (x: obj) (writer: TextWriter) -> fprintfn writer "%120A" # HTML Type Provider -This article demonstrates how to use the HTML type provider to read HTML tables files -in a statically typed way. +This article demonstrates how to use the HTML type provider to read HTML pages in a +statically typed way. The provider is useful for extracting data from pages that expose +information as HTML tables or via [schema.org microdata](https://schema.org/) markup. -The HTML Type Provider takes a sample HTML document as input and generates a type based on the data -present in the columns of that sample. The column names are obtained from the first (header) row. 
+The HTML Type Provider takes a sample HTML document as input — either a URL, a local file +path, or an inline HTML string — and generates F# types for the tables and microdata found +in that document. You can then load live data using the same or a different URL at runtime. -## Introducing the provider +## Parsing HTML Tables + +HTML tables (`` elements) are the primary target of the HTML Type Provider. +Each discovered table becomes a typed property on the generated `.Tables` container. + +**Table naming**: Table properties are named after the nearest preceding heading element +(`

`, `

`, etc.), the `

` element, or the table's `id`, `name`, `title`, +`summary`, or `aria-label` attribute — whichever is found first. If none are found the +table is named `Table1`, `Table2`, etc. by position. + +**Column types**: The provider infers column types (`int`, `float`, `DateTime`, `string`, +etc.) from the values in the sample document. Columns that cannot be fully inferred +default to `string`. -The type provider is located in the `FSharp.Data.dll` assembly. Assuming the assembly -is located in the `../../../bin` directory, we can load it in F# Interactive as follows: *) open FSharp.Data (** -### Parsing F1 Calendar Data +### Formula 1 Race Calendar (Wikipedia) -This example shows an example of using the HTML Type Provider to extract each row from a table on a Wikipedia page. +This example extracts the 2017 Formula 1 race calendar from Wikipedia. The page has +multiple tables; we access the one whose nearest preceding heading is "Calendar". -Usually with HTML files headers are demarked by using the `
` tag, however this is not true in general, so the provider assumes that the -first row is headers. (This behaviour is likely to get smarter in later releases). But it highlights a general problem about HTML's strictness. +The provider infers that `Round` is an integer and `Date` is a `DateTime` based on the +sample data. *) [] @@ -58,13 +71,6 @@ let F1_2017_URL = type F1_2017 = HtmlProvider -(** -The generated type provides a type space of tables that it has managed to parse out of the given HTML Document. -Each type's name is derived from either the id, title, name, summary or caption attributes/tags provided. If none of these -entities exist then the table will simply be named `Tablexx` where xx is the position in the HTML document if all of the tables were flattened out into a list. -The `Load` method allows reading the data from a file or web resource. We could also have used a web URL instead of a local file in the sample parameter of the type provider. -The following sample calls the `Load` method with an URL that points to a live version of the same page on Wikipedia. -*) // Download the table for the 2017 F1 calendar from Wikipedia let f1Calendar = F1_2017.Load(F1_2017_URL).Tables.Calendar @@ -74,45 +80,79 @@ let round = firstRow.Round let grandPrix = firstRow.``Grand Prix`` let date = firstRow.Date -// Print the round, location and date for each race, corresponding to a row +// Print the round, location and date for each race for row in f1Calendar.Rows do printfn "Race, round %A is hosted at %A on %A" row.Round row.``Grand Prix`` row.Date (*** include-fsi-merged-output ***) (** -The generated type has a property `Rows` that returns the data from the HTML file as a -collection of rows. We iterate over the rows using a `for` loop. As you can see the -(generated) type for rows has properties such as `Grand Prix`, `Circuit`, `Round` and `Date` that correspond -to the columns in the selected HTML table file. 
- -As you can see, the type provider also infers types of individual rows. The `Date` -property is inferred to be a `DateTime` (because the values in the sample file can all -be parsed as dates) while other columns are inferred as the correct type where possible. +The generated type has a property `Rows` that returns the data from the HTML page as a +collection of rows. We iterate over the rows using a `for` loop. The row type has +properties such as `Grand Prix`, `Circuit`, `Round` and `Date` that correspond to the +columns in the HTML table. + +The `Load` method accepts any URL, file path, or HTML string — we could load a different +season's page at runtime while still using the 2017 page for type inference at compile time. + +### Wikipedia S&P 500 Components + +This example queries the Wikipedia list of S&P 500 companies, which exposes symbol, +sector, and founding year for each constituent. Note that the table has no heading or +caption, so the provider names it `Table1` by position. *) -(** +[] +let SP500_URL = + "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies" + +type SP500 = HtmlProvider -### Parsing Nuget package stats +let sp500 = SP500.Load(SP500_URL) -This small sample shows how the HTML Type Provider can be used to scrape data from a website. In this example, we analyze the download counts of the FSharp.Data package on NuGet. -Note that we're using the live URL as the sample, so we can just use the default constructor as the runtime data will be the same as the compile time data. 
+// Table1 is the main constituents table (no caption on the Wikipedia page) +let companies = sp500.Tables.Table1 +// Show the first five companies with their sector +for row in companies.Rows |> Seq.truncate 5 do + printfn "%s (%s) — %s, founded %d" row.Symbol row.Security row.``GICS Sector`` row.Founded + +(*** include-fsi-merged-output ***) + +(** +We can also use standard F# collection functions to analyse the data — for example, +grouping companies by sector to see how many are in each: *) +let bySector = + companies.Rows + |> Seq.groupBy (fun r -> r.``GICS Sector``) + |> Seq.map (fun (sector, rows) -> sector, Seq.length rows) + |> Seq.sortByDescending snd + |> Seq.toArray + +(*** include-fsi-merged-output ***) + +(** + +### NuGet Package Statistics + +This example uses the HTML Type Provider to scrape download statistics for the +`FSharp.Data` package from NuGet. Because we pass the live URL as the type parameter, +the default constructor loads the same live page at runtime. +*) -// Configure the type provider type NugetStats = HtmlProvider<"https://www.nuget.org/packages/FSharp.Data"> -// load the live package stats for FSharp.Data +// Load the live package stats for FSharp.Data let rawStats = NugetStats().Tables.``Version History of FSharp.Data`` -// helper function to analyze version numbers from Nuget +// Helper to extract the minor version string (e.g. 
"6.4" from "6.4.0") let getMinorVersion (v: string) = - System.Text.RegularExpressions.Regex(@"\d.\d").Match(v).Value + System.Text.RegularExpressions.Regex(@"\d+\.\d+").Match(v).Value -// group by minor version and calculate the download count -let stats = +// Group downloads by minor version +let downloadsByMinorVersion = rawStats.Rows |> Seq.groupBy (fun r -> getMinorVersion r.Version) |> Seq.map (fun (k, xs) -> k, xs |> Seq.sumBy (fun x -> x.Downloads)) @@ -122,37 +162,126 @@ let stats = (** -### Getting statistics on Doctor Who - -This sample shows some more screen scraping from Wikipedia: +### Getting Statistics on Doctor Who (Multiple Tables) +Wikipedia pages often contain many tables. This example loads the Doctor Who episode +guide and aggregates viewing figures by director across the first series. *) -(*** define-output:doctorWhoChart ***) [] let DrWho = "https://en.wikipedia.org/wiki/List_of_Doctor_Who_episodes_(1963%E2%80%931989)" let doctorWho = new HtmlProvider() -// Get the average number of viewers for each doctor's series run -let viewersByDoctor = +// Get the average number of viewers for each director in Series 1 +let viewersByDirector = doctorWho.Tables.``Season 1 (1963-1964)``.Rows - |> Seq.groupBy (fun season -> season.``Directed by``) - |> Seq.map (fun (doctor, seasons) -> - let averaged = - seasons |> Seq.averageBy (fun season -> season.``UK viewers (millions)``) + |> Seq.groupBy (fun episode -> episode.``Directed by``) + |> Seq.map (fun (director, episodes) -> + let avgViewers = + episodes |> Seq.averageBy (fun e -> e.``UK viewers (millions)``) - doctor, averaged) + director, avgViewers) |> Seq.toArray +(*** include-fsi-merged-output ***) + +(** + +## Schema.org Microdata + +In addition to HTML tables, the provider also understands +[HTML microdata](https://html.spec.whatwg.org/multipage/microdata.html) — the +`itemscope` / `itemtype` / `itemprop` attributes defined by the +[schema.org](https://schema.org/) vocabulary. 
When the sample document contains +`itemscope` elements, the provider generates a `.Schemas` container with one typed +property per schema type found. + +The example below uses an inline HTML string as the type parameter sample. At runtime +you can call `.Load(url)` to parse any live page that uses the same schema types. +*) + +[] +let ProductCatalogSample = + """ +
+ FSharp.Data + F# data-access library + F# Foundation + FSHARP-DATA +
+
+ Newtonsoft.Json + Popular high-performance JSON framework for .NET + James Newton-King + NEWTONSOFT-JSON +
+""" + +type ProductCatalog = HtmlProvider + +let catalog = ProductCatalog.Parse(ProductCatalogSample) + +// Schemas.Product returns an array of typed Product items +for product in catalog.Schemas.Product do + printfn "%s — %s (SKU: %s)" product.Name product.Brand product.Sku (*** include-fsi-merged-output ***) (** +Each item in `Schemas.Product` has one string property per `itemprop` name discovered in +the sample, with names Pascal-cased for F# convention. The `.Html` property gives +access to the underlying `HtmlNode` for cases where you need to traverse sub-elements. + +You can mix microdata and table extraction in the same type: +*) + +[] +let MixedPageSample = + """ +
+ Grace Hopper + Rear Admiral + Yale University +
+ + + + + +
YearAchievement
1944Debugged the Harvard Mark II
1952First compiler (A-0 System)
1959Co-designed COBOL
+""" + +type PersonPage = HtmlProvider + +let page = PersonPage.Parse(MixedPageSample) + +// Access the microdata +let person = page.Schemas.Person |> Array.head +printfn "Name: %s, Title: %s" person.Name person.JobTitle + +// Access the table +for row in page.Tables.Table1.Rows do + printfn "%d: %s" row.Year row.Achievement + +(*** include-fsi-merged-output ***) + +(** + +> **Note on modern websites**: Many websites today (including IMDB and eBay) use +> client-side JavaScript frameworks (React, Next.js, etc.) that render content +> dynamically — meaning the HTML served to the browser contains little or no table +> data or microdata. These sites often embed structured data as +> [JSON-LD](https://json-ld.org/) inside a ` + + + + + + + + +
YearEvent
2002F# created by Don Syme at Microsoft Research Cambridge
2005First public release of F#
2010F# ships with Visual Studio 2010
2020F# 5.0 released
+ +""" + +type WikipediaArticle = HtmlProvider + +let wiki = WikipediaArticle.Parse(WikipediaArticleSample) + +// Access the JSON-LD Article metadata +let article = wiki.JsonLd.Article |> Array.head +printfn "Title: %s" article.Name +printfn "Description: %s" article.Description +printfn "Published: %s | Modified: %s" article.DatePublished article.DateModified +printfn "URL: %s" article.Url + +(*** include-fsi-merged-output ***) + +(** +The `.JsonLd` container has one property per `@type` found in the JSON-LD blocks. Each +property returns an array of items — so `wiki.JsonLd.Article` is an +`WikipediaArticle+Article[]`. Each item has one `string` property per top-level scalar +field (strings, numbers, booleans), with names Pascal-cased for F# convention. A `.Raw` +property gives the original JSON text if you need to access complex nested values. + +### Wikipedia timeline article: tables + JSON-LD metadata + +The next example shows combining the Wikipedia HTML table (a timeline of events) with the +JSON-LD article metadata, all via a single provider type: +*) + +// Access the events table in the same page +for row in wiki.Tables.Table1.Rows do + printfn "%d: %s" row.Year row.Event + +(*** include-fsi-merged-output ***) + +(** + +### Multiple JSON-LD types on one page + +Some pages include multiple JSON-LD blocks, e.g. a `WebPage` descriptor alongside the +`Article`. 
The provider generates separate typed properties for each `@type`: +*) + +[] +let WikipediaWithWebPageSample = + """ + + + + + +""" + +type WikipediaWithWebPage = HtmlProvider + +let wikiMulti = WikipediaWithWebPage.Parse(WikipediaWithWebPageSample) + +// Both JsonLd types are available as separate typed properties +printfn "WebPage name: %s" wikiMulti.JsonLd.WebPage.[0].Name +printfn "Article name: %s" wikiMulti.JsonLd.Article.[0].Name +printfn "Article published: %s" wikiMulti.JsonLd.Article.[0].DatePublished + +(*** include-fsi-merged-output ***) + +(** + +### Accessing raw JSON for complex properties + +For properties with nested object values (such as `image`, `author`, or `publisher` +in a Wikipedia article), only scalar top-level fields are reflected as typed properties. +Use the `.Raw` property to access the full original JSON and parse it further with +`JsonProvider` or `JsonValue.Parse` if needed: +*) + +[] +let WikipediaPersonSample = + """ + + + + + + + + + +
YearAchievement
1936Turing machine concept published
1939Bombe code-breaking machine
1950Turing Test proposed
+ +""" + +type WikipediaPerson = HtmlProvider + +let turingPage = WikipediaPerson.Parse(WikipediaPersonSample) + +// JSON-LD article metadata +let turingArticle = turingPage.JsonLd.Article.[0] +printfn "Subject: %s" turingArticle.Name +printfn "Published: %s" turingArticle.DatePublished +printfn "License: %s" turingArticle.License + +// Timeline table from the article body +for row in turingPage.Tables.Table1.Rows do + printfn "%d — %s" row.Year row.Achievement + +(*** include-fsi-merged-output ***) + +(** + +## Summary of structured data formats + +| Format | HTML mechanism | Provider access | Typical use | +|---|---|---|---| +| Tables | `<table>` elements | `.Tables.TableName` | Tabular data, statistics | +| Microdata | `itemscope`/`itemprop` attributes | `.Schemas.TypeName` | Inline product/event/person markup | +| JSON-LD | ` + + +""" + + use reader = new StringReader(htmlContent) + let htmlDoc = HtmlDocument.Create(false, reader) + let article = htmlDoc.GetJsonLd("Article") + article.Items |> should haveLength 1 + article.Items.[0].Properties |> Map.find "name" |> should equal "Test Article" + +[] +let ``HtmlDocument.Create returns empty JSON-LD when no script blocks present`` () = + let htmlContent = "

No JSON-LD here

" + + use reader = new StringReader(htmlContent) + let htmlDoc = HtmlDocument.Create(false, reader) + (fun () -> htmlDoc.GetJsonLd("Article") |> ignore) |> should throw typeof + +[] +let ``HtmlDocument.Create handles multiple JSON-LD script blocks of different types`` () = + let htmlContent = + """ + + + + + +""" + + use reader = new StringReader(htmlContent) + let htmlDoc = HtmlDocument.Create(false, reader) + htmlDoc.GetJsonLd("Article").Items |> should haveLength 1 + htmlDoc.GetJsonLd("WebSite").Items |> should haveLength 1 + htmlDoc.GetJsonLd("WebSite").Items.[0].Properties |> Map.find "url" |> should equal "https://example.com" + +[] +let ``JSON-LD parser handles array top-level`` () = + let htmlContent = + """ + + + + +""" + + use reader = new StringReader(htmlContent) + let htmlDoc = HtmlDocument.Create(false, reader) + htmlDoc.GetJsonLd("Article").Items |> should haveLength 2 + +[] +let ``JSON-LD Raw property contains the original JSON`` () = + let htmlContent = + """ + + + + +""" + + use reader = new StringReader(htmlContent) + let htmlDoc = HtmlDocument.Create(false, reader) + let item = htmlDoc.GetJsonLd("Article").Items.[0] + item.Raw |> should contain "Test" + item.Raw |> should contain "Desc" diff --git a/tests/FSharp.Data.Tests/HtmlProvider.fs b/tests/FSharp.Data.Tests/HtmlProvider.fs index b8b145542..30a56a435 100644 --- a/tests/FSharp.Data.Tests/HtmlProvider.fs +++ b/tests/FSharp.Data.Tests/HtmlProvider.fs @@ -412,3 +412,120 @@ let ``HtmlProvider Schemas.Person items have typed properties`` () = people.[0].JobTitle |> should equal "Software Engineer" people.[0].Url |> should equal "https://example.com" people.[1].Name |> should equal "Bob Jones" + +// ============================================ +// JSON-LD via HtmlProvider +// ============================================ + +[] +let jsonLdArticle = + """ + + + +

Content here

+""" + +type JsonLdArticleHtml = HtmlProvider + +[] +let ``HtmlProvider exposes JsonLd container for JSON-LD`` () = + let doc = JsonLdArticleHtml.GetSample() + let articles = doc.JsonLd.Article + articles |> should not' (be null) + +[] +let ``HtmlProvider JsonLd.Article returns items`` () = + let doc = JsonLdArticleHtml.GetSample() + let articles = doc.JsonLd.Article + articles |> should haveLength 1 + +[] +let ``HtmlProvider JsonLd.Article items have typed properties`` () = + let doc = JsonLdArticleHtml.GetSample() + let article = doc.JsonLd.Article.[0] + article.Name |> should equal "F# Data Library" + article.Headline |> should equal "F# Data: Library for Data Access" + article.Url |> should equal "https://fsprojects.github.io/FSharp.Data" + article.DatePublished |> should equal "2012-01-01" + +[] +let ``HtmlProvider JsonLd.Article item has Raw property`` () = + let doc = JsonLdArticleHtml.GetSample() + let article = doc.JsonLd.Article.[0] + article.Raw |> should contain "F# Data Library" + +[] +let jsonLdMultipleTypes = + """ + + + + + +""" + +type JsonLdMultiTypeHtml = HtmlProvider + +[] +let ``HtmlProvider JsonLd supports multiple type groups`` () = + let doc = JsonLdMultiTypeHtml.GetSample() + doc.JsonLd.WebPage |> should haveLength 1 + doc.JsonLd.Article |> should haveLength 1 + doc.JsonLd.WebPage.[0].Name |> should equal "My Page" + doc.JsonLd.Article.[0].Name |> should equal "My Article" + +[] +let jsonLdMixed = + """ + + + + +
+ + +
Column1Column2
Value1Value2
+ +""" + +type JsonLdMixedHtml = HtmlProvider + +[] +let ``HtmlProvider can combine JSON-LD and table extraction`` () = + let doc = JsonLdMixedHtml.GetSample() + doc.JsonLd.Article.[0].Name |> should equal "Mixed Page" + doc.Tables.Table1.Rows |> should haveLength 1 From 66e96edceb4365db656196c18c5d5a9d917fc561 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 28 Feb 2026 14:42:35 +0000 Subject: [PATCH 8/8] ci: trigger CI checks