diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 0d3fb92f6..8e21e786d 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -2,6 +2,8 @@ ## 8.1.0-beta +- Add schema.org microdata support to `HtmlProvider`: when an HTML document contains elements with `itemscope`/`itemtype`/`itemprop` attributes, the provider now generates a typed `Schemas` container (e.g. `doc.Schemas.Person`) with one strongly-typed property per `itemprop` name discovered in the sample (closes #611) +- Add JSON-LD support to `HtmlProvider`: when an HTML document contains ` + + + + + + + + +
YearEvent
2002F# created by Don Syme at Microsoft Research Cambridge
2005First public release of F#
2010F# ships with Visual Studio 2010
2020F# 5.0 released
+ +""" + +type WikipediaArticle = HtmlProvider + +let wiki = WikipediaArticle.Parse(WikipediaArticleSample) + +// Access the JSON-LD Article metadata +let article = wiki.JsonLd.Article |> Array.head +printfn "Title: %s" article.Name +printfn "Description: %s" article.Description +printfn "Published: %s | Modified: %s" article.DatePublished article.DateModified +printfn "URL: %s" article.Url + +(*** include-fsi-merged-output ***) + +(** +The `.JsonLd` container has one property per `@type` found in the JSON-LD blocks. Each +property returns an array of items — so `wiki.JsonLd.Article` is a +`WikipediaArticle+Article[]`. Each item has one `string` property per top-level scalar +field (strings, numbers, booleans), with names Pascal-cased for F# convention. A `.Raw` +property gives the original JSON text if you need to access complex nested values. + +### Wikipedia timeline article: tables + JSON-LD metadata + +The next example combines the Wikipedia HTML table (a timeline of events) with the +JSON-LD article metadata, all via a single provider type: +*) + +// Access the events table on the same page +for row in wiki.Tables.Table1.Rows do + printfn "%d: %s" row.Year row.Event + +(*** include-fsi-merged-output ***) + +(** + +### Multiple JSON-LD types on one page + +Some pages include multiple JSON-LD blocks, e.g. a `WebPage` descriptor alongside the +`Article`. 
The provider generates separate typed properties for each `@type`: +*) + +[] +let WikipediaWithWebPageSample = + """ + + + + + +""" + +type WikipediaWithWebPage = HtmlProvider + +let wikiMulti = WikipediaWithWebPage.Parse(WikipediaWithWebPageSample) + +// Both JsonLd types are available as separate typed properties +printfn "WebPage name: %s" wikiMulti.JsonLd.WebPage.[0].Name +printfn "Article name: %s" wikiMulti.JsonLd.Article.[0].Name +printfn "Article published: %s" wikiMulti.JsonLd.Article.[0].DatePublished + +(*** include-fsi-merged-output ***) + +(** + +### Accessing raw JSON for complex properties + +For properties with nested object values (such as `image`, `author`, or `publisher` +in a Wikipedia article), only scalar top-level fields are reflected as typed properties. +Use the `.Raw` property to access the full original JSON and parse it further with +`JsonProvider` or `JsonValue.Parse` if needed: +*) + +[] +let WikipediaPersonSample = + """ + + + + + + + + + +
YearAchievement
1936Turing machine concept published
1939Bombe code-breaking machine
1950Turing Test proposed
+ +""" + +type WikipediaPerson = HtmlProvider + +let turingPage = WikipediaPerson.Parse(WikipediaPersonSample) + +// JSON-LD article metadata +let turingArticle = turingPage.JsonLd.Article.[0] +printfn "Subject: %s" turingArticle.Name +printfn "Published: %s" turingArticle.DatePublished +printfn "License: %s" turingArticle.License + +// Timeline table from the article body +for row in turingPage.Tables.Table1.Rows do + printfn "%d — %s" row.Year row.Achievement + +(*** include-fsi-merged-output ***) + +(** + +## Summary of structured data formats + +| Format | HTML mechanism | Provider access | Typical use | +|---|---|---|---| +| Tables | `<table>` elements | `.Tables.TableName` | Tabular data, statistics | +| Microdata | `itemscope`/`itemprop` attributes | `.Schemas.TypeName` | Inline product/event/person markup | +| JSON-LD | ` + + +""" + + use reader = new StringReader(htmlContent) + let htmlDoc = HtmlDocument.Create(false, reader) + let article = htmlDoc.GetJsonLd("Article") + article.Items |> should haveLength 1 + article.Items.[0].Properties |> Map.find "name" |> should equal "Test Article" + +[] +let ``HtmlDocument.Create returns empty JSON-LD when no script blocks present`` () = + let htmlContent = "

No JSON-LD here

" + + use reader = new StringReader(htmlContent) + let htmlDoc = HtmlDocument.Create(false, reader) + (fun () -> htmlDoc.GetJsonLd("Article") |> ignore) |> should throw typeof + +[] +let ``HtmlDocument.Create handles multiple JSON-LD script blocks of different types`` () = + let htmlContent = + """ + + + + + +""" + + use reader = new StringReader(htmlContent) + let htmlDoc = HtmlDocument.Create(false, reader) + htmlDoc.GetJsonLd("Article").Items |> should haveLength 1 + htmlDoc.GetJsonLd("WebSite").Items |> should haveLength 1 + htmlDoc.GetJsonLd("WebSite").Items.[0].Properties |> Map.find "url" |> should equal "https://example.com" + +[] +let ``JSON-LD parser handles array top-level`` () = + let htmlContent = + """ + + + + +""" + + use reader = new StringReader(htmlContent) + let htmlDoc = HtmlDocument.Create(false, reader) + htmlDoc.GetJsonLd("Article").Items |> should haveLength 2 + +[] +let ``JSON-LD Raw property contains the original JSON`` () = + let htmlContent = + """ + + + + +""" + + use reader = new StringReader(htmlContent) + let htmlDoc = HtmlDocument.Create(false, reader) + let item = htmlDoc.GetJsonLd("Article").Items.[0] + item.Raw |> should contain "Test" + item.Raw |> should contain "Desc" diff --git a/tests/FSharp.Data.Core.Tests/HtmlRuntimeTypes.fs b/tests/FSharp.Data.Core.Tests/HtmlRuntimeTypes.fs index e7262923f..4a7e907df 100644 --- a/tests/FSharp.Data.Core.Tests/HtmlRuntimeTypes.fs +++ b/tests/FSharp.Data.Core.Tests/HtmlRuntimeTypes.fs @@ -98,4 +98,104 @@ let ``HtmlDefinitionList record ToString should handle empty definitions`` () = result |> should contain name // Note: Generic HtmlList and HtmlTable tests are complex due to internal type usage -// The record types (HtmlList and HtmlDefinitionList) above provide good coverage for the 0% areas \ No newline at end of file +// The record types (HtmlList and HtmlDefinitionList) above provide good coverage for the 0% areas +// ============================================ +// Schema.org Microdata 
Tests +// ============================================ + +[] +let ``getSchemas returns empty list when no microdata present`` () = + let doc = HtmlDocument.Parse "

No microdata here.

" + let schemas = HtmlRuntime.getSchemas doc + schemas |> List.length |> should equal 0 + +[] +let ``getSchemas finds itemscope items and groups by itemtype`` () = + let html = """ + +
+ Jane Doe + Engineer +
+""" + + let doc = HtmlDocument.Parse html + let schemas = HtmlRuntime.getSchemas doc + schemas |> should haveLength 1 + let group = schemas.[0] + group.TypeUrl |> should equal "http://schema.org/Person" + group.Name |> should equal "Person" + group.Items |> should haveLength 1 + group.Items.[0].Properties |> Map.find "name" |> should equal "Jane Doe" + group.Items.[0].Properties |> Map.find "jobTitle" |> should equal "Engineer" + +[] +let ``getSchemas groups multiple items of same type`` () = + let html = """ + +
+ Widget A + 9.99 +
+
+ Widget B +
+""" + + let doc = HtmlDocument.Parse html + let schemas = HtmlRuntime.getSchemas doc + schemas |> should haveLength 1 + let group = schemas.[0] + group.Items |> should haveLength 2 + group.Items.[0].Properties |> Map.find "name" |> should equal "Widget A" + group.Items.[1].Properties |> Map.find "name" |> should equal "Widget B" + group.Properties |> should contain "name" + group.Properties |> should contain "price" + +[] +let ``getSchemas creates separate groups for different schema types`` () = + let html = """ + +
+ John +
+
+ Acme Corp +
+""" + + let doc = HtmlDocument.Parse html + let schemas = HtmlRuntime.getSchemas doc + schemas |> should haveLength 2 + +[] +let ``getSchemas uses content attribute for meta elements`` () = + let html = """ + +
+ +
+""" + + let doc = HtmlDocument.Parse html + let schemas = HtmlRuntime.getSchemas doc + schemas.[0].Items.[0].Properties |> Map.find "startDate" |> should equal "2024-01-15" + +[] +let ``HtmlSchemaGroup ToString formats correctly`` () = + let node = HtmlNode.NewText "dummy" + + let item = + { HtmlSchemaItem.Properties = Map.ofList [ "name", "Alice" ] + Html = node } + + let group = + { HtmlSchemaGroup.Name = "Person" + TypeUrl = "http://schema.org/Person" + Items = [| item |] + Properties = [| "name" |] } + + let result = group.ToString() + result |> should contain "Person" + result |> should contain "name" + result |> should contain "Alice" diff --git a/tests/FSharp.Data.DesignTime.Tests/expected/Html,ebay_cars.htm,False,False,.expected b/tests/FSharp.Data.DesignTime.Tests/expected/Html,ebay_cars.htm,False,False,.expected index cbfaf92ff..c5604c4e8 100644 --- a/tests/FSharp.Data.DesignTime.Tests/expected/Html,ebay_cars.htm,False,False,.expected +++ b/tests/FSharp.Data.DesignTime.Tests/expected/Html,ebay_cars.htm,False,False,.expected @@ -28,6 +28,9 @@ class HtmlProvider : FDR.BaseTypes.HtmlDocument static member Parse: text:string -> HtmlProvider HtmlDocument.Create(false, ((new StringReader(text)) :> TextReader)) + member Schemas: HtmlProvider+SchemasContainer with get + this + member Tables: HtmlProvider+TablesContainer with get this @@ -94,6 +97,11 @@ class HtmlProvider+ListsContainer : FDR.BaseTypes.HtmlDocument HtmlList<_>.Create(rowConverter, this, "See also") +class HtmlProvider+SchemasContainer : FDR.BaseTypes.HtmlDocument + member SearchResultsPage: HtmlProvider+HtmlProvider+SearchResultsPage[] with get + this.GetSchema("SearchResultsPage").Items + + class HtmlProvider+TablesContainer : FDR.BaseTypes.HtmlDocument member ``Additional site navigation``: HtmlProvider+AdditionalSiteNavigation with get let rowConverter = new Func<_,_>(fun (row:string[]) -> @@ -124,6 +132,15 @@ class HtmlProvider+Menu4 : FDR.BaseTypes.HtmlList class HtmlProvider+Menu5 : 
FDR.BaseTypes.HtmlList +class HtmlProvider+SearchResultsPage : FDR.HtmlSchemaItem + member Html: HtmlNode with get + this.Html + + member Offers: string with get + this.Properties |> (MapModule.TryFind "offers" table) |> let value = "" + Lambda (option, Call (None, DefaultValue, [value, option])) + + class HtmlProvider+SeeAlso : FDR.BaseTypes.HtmlList class HtmlProvider+AdditionalSiteNavigation+Row : string * string diff --git a/tests/FSharp.Data.DesignTime.Tests/expected/Html,imdb_chart.htm,False,False,.expected b/tests/FSharp.Data.DesignTime.Tests/expected/Html,imdb_chart.htm,False,False,.expected index ed0343fde..dc8cf435d 100644 --- a/tests/FSharp.Data.DesignTime.Tests/expected/Html,imdb_chart.htm,False,False,.expected +++ b/tests/FSharp.Data.DesignTime.Tests/expected/Html,imdb_chart.htm,False,False,.expected @@ -28,6 +28,9 @@ class HtmlProvider : FDR.BaseTypes.HtmlDocument static member Parse: text:string -> HtmlProvider HtmlDocument.Create(false, ((new StringReader(text)) :> TextReader)) + member Schemas: HtmlProvider+SchemasContainer with get + this + member Tables: HtmlProvider+TablesContainer with get this @@ -70,6 +73,11 @@ class HtmlProvider+ListsContainer : FDR.BaseTypes.HtmlDocument HtmlList<_>.Create(rowConverter, this, "YOUR WATCHLIST") +class HtmlProvider+SchemasContainer : FDR.BaseTypes.HtmlDocument + member AggregateRating: HtmlProvider+HtmlProvider+AggregateRating[] with get + this.GetSchema("AggregateRating").Items + + class HtmlProvider+TablesContainer : FDR.BaseTypes.HtmlDocument member ``Take The Quiz!``: HtmlProvider+TakeTheQuiz with get let rowConverter = new Func<_,_>(fun (row:string[]) -> @@ -106,6 +114,23 @@ class HtmlProvider+TablesContainer : FDR.BaseTypes.HtmlDocument HtmlTable<_>.Create(rowConverter, this, "Top 250", true) +class HtmlProvider+AggregateRating : FDR.HtmlSchemaItem + member BestRating: string with get + this.Properties |> (MapModule.TryFind "bestRating" table) |> let value = "" + Lambda (option, Call (None, 
DefaultValue, [value, option])) + + member Html: HtmlNode with get + this.Html + + member RatingCount: string with get + this.Properties |> (MapModule.TryFind "ratingCount" table) |> let value = "" + Lambda (option, Call (None, DefaultValue, [value, option])) + + member RatingValue: string with get + this.Properties |> (MapModule.TryFind "ratingValue" table) |> let value = "" + Lambda (option, Call (None, DefaultValue, [value, option])) + + class HtmlProvider+ConsumerMainNav : FDR.BaseTypes.HtmlList class HtmlProvider+GetDiscovered : FDR.BaseTypes.HtmlList diff --git a/tests/FSharp.Data.DesignTime.Tests/expected/Html,zoopla.html,False,False,.expected b/tests/FSharp.Data.DesignTime.Tests/expected/Html,zoopla.html,False,False,.expected index 8db4418d1..4f1c3a30f 100644 --- a/tests/FSharp.Data.DesignTime.Tests/expected/Html,zoopla.html,False,False,.expected +++ b/tests/FSharp.Data.DesignTime.Tests/expected/Html,zoopla.html,False,False,.expected @@ -28,6 +28,9 @@ class HtmlProvider : FDR.BaseTypes.HtmlDocument static member Parse: text:string -> HtmlProvider HtmlDocument.Create(false, ((new StringReader(text)) :> TextReader)) + member Schemas: HtmlProvider+SchemasContainer with get + this + member Tables: HtmlProvider+TablesContainer with get this @@ -130,6 +133,11 @@ class HtmlProvider+ListsContainer : FDR.BaseTypes.HtmlDocument HtmlList<_>.Create(rowConverter, this, "Property type") +class HtmlProvider+SchemasContainer : FDR.BaseTypes.HtmlDocument + member WebPage: HtmlProvider+HtmlProvider+WebPage[] with get + this.GetSchema("WebPage").Items + + class HtmlProvider+TablesContainer : FDR.BaseTypes.HtmlDocument member ``Currently available to rent in London``: HtmlProvider+CurrentlyAvailableToRentInLondon with get let rowConverter = new Func<_,_>(fun (row:string[]) -> @@ -180,6 +188,19 @@ class HtmlProvider+PropertyFeatures : FDR.BaseTypes.HtmlList class HtmlProvider+PropertyType : FDR.BaseTypes.HtmlList +class HtmlProvider+WebPage : FDR.HtmlSchemaItem + member 
Breadcrumb: string with get + this.Properties |> (MapModule.TryFind "breadcrumb" table) |> let value = "" + Lambda (option, Call (None, DefaultValue, [value, option])) + + member Html: HtmlNode with get + this.Html + + member Name: string with get + this.Properties |> (MapModule.TryFind "name" table) |> let value = "" + Lambda (option, Call (None, DefaultValue, [value, option])) + + class HtmlProvider+CurrentlyAvailableToRentInLondon+Row : string * int * int * int * float member ``1 bed``: int with get (let _,t2,_,_,_ = this in t2) diff --git a/tests/FSharp.Data.DesignTime.Tests/expected/Html,zoopla2.html,False,False,.expected b/tests/FSharp.Data.DesignTime.Tests/expected/Html,zoopla2.html,False,False,.expected index 47eb0fa4d..d88cc1779 100644 --- a/tests/FSharp.Data.DesignTime.Tests/expected/Html,zoopla2.html,False,False,.expected +++ b/tests/FSharp.Data.DesignTime.Tests/expected/Html,zoopla2.html,False,False,.expected @@ -28,6 +28,9 @@ class HtmlProvider : FDR.BaseTypes.HtmlDocument static member Parse: text:string -> HtmlProvider HtmlDocument.Create(false, ((new StringReader(text)) :> TextReader)) + member Schemas: HtmlProvider+SchemasContainer with get + this + member Tables: HtmlProvider+TablesContainer with get this @@ -130,6 +133,11 @@ class HtmlProvider+ListsContainer : FDR.BaseTypes.HtmlDocument HtmlList<_>.Create(rowConverter, this, "Property type") +class HtmlProvider+SchemasContainer : FDR.BaseTypes.HtmlDocument + member SearchResultsPage: HtmlProvider+HtmlProvider+SearchResultsPage[] with get + this.GetSchema("SearchResultsPage").Items + + class HtmlProvider+TablesContainer : FDR.BaseTypes.HtmlDocument member ``Currently available to rent in London``: HtmlProvider+CurrentlyAvailableToRentInLondon with get let rowConverter = new Func<_,_>(fun (row:string[]) -> @@ -180,6 +188,11 @@ class HtmlProvider+List9 : FDR.BaseTypes.HtmlList class HtmlProvider+PropertyType : FDR.BaseTypes.HtmlList +class HtmlProvider+SearchResultsPage : FDR.HtmlSchemaItem + member 
Html: HtmlNode with get + this.Html + + class HtmlProvider+CurrentlyAvailableToRentInLondon+Row : string * int * int * int * float member ``1 bed``: int with get (let _,t2,_,_,_ = this in t2) diff --git a/tests/FSharp.Data.Tests/HtmlProvider.fs b/tests/FSharp.Data.Tests/HtmlProvider.fs index 95b33ee7d..30a56a435 100644 --- a/tests/FSharp.Data.Tests/HtmlProvider.fs +++ b/tests/FSharp.Data.Tests/HtmlProvider.fs @@ -372,3 +372,160 @@ let ``Can infer DateTime and DateTimeOffset types correctly`` () = table.Rows.[0].DateOnly.GetType() |> should equal typeof table.Rows.[0].MixedDate.GetType() |> should equal typeof table.Rows.[0].DateWithOffset.GetType() |> should equal typeof + +// ============================================ +// Schema.org Microdata via HtmlProvider +// ============================================ + +[] +let schemaMicrodata = """ +
+ Jane Smith + Software Engineer + +
+
+ Bob Jones + Designer +
+""" + +type SchemaMicrodataHtml = HtmlProvider + +[] +let ``HtmlProvider exposes Schemas container for microdata`` () = + let doc = SchemaMicrodataHtml.GetSample() + let people = doc.Schemas.Person + people |> should not' (be null) + +[] +let ``HtmlProvider Schemas.Person returns all items`` () = + let doc = SchemaMicrodataHtml.GetSample() + let people = doc.Schemas.Person + people |> should haveLength 2 + +[] +let ``HtmlProvider Schemas.Person items have typed properties`` () = + let doc = SchemaMicrodataHtml.GetSample() + let people = doc.Schemas.Person + people.[0].Name |> should equal "Jane Smith" + people.[0].JobTitle |> should equal "Software Engineer" + people.[0].Url |> should equal "https://example.com" + people.[1].Name |> should equal "Bob Jones" + +// ============================================ +// JSON-LD via HtmlProvider +// ============================================ + +[] +let jsonLdArticle = + """ + + + +

Content here

+""" + +type JsonLdArticleHtml = HtmlProvider + +[] +let ``HtmlProvider exposes JsonLd container for JSON-LD`` () = + let doc = JsonLdArticleHtml.GetSample() + let articles = doc.JsonLd.Article + articles |> should not' (be null) + +[] +let ``HtmlProvider JsonLd.Article returns items`` () = + let doc = JsonLdArticleHtml.GetSample() + let articles = doc.JsonLd.Article + articles |> should haveLength 1 + +[] +let ``HtmlProvider JsonLd.Article items have typed properties`` () = + let doc = JsonLdArticleHtml.GetSample() + let article = doc.JsonLd.Article.[0] + article.Name |> should equal "F# Data Library" + article.Headline |> should equal "F# Data: Library for Data Access" + article.Url |> should equal "https://fsprojects.github.io/FSharp.Data" + article.DatePublished |> should equal "2012-01-01" + +[] +let ``HtmlProvider JsonLd.Article item has Raw property`` () = + let doc = JsonLdArticleHtml.GetSample() + let article = doc.JsonLd.Article.[0] + article.Raw |> should contain "F# Data Library" + +[] +let jsonLdMultipleTypes = + """ + + + + + +""" + +type JsonLdMultiTypeHtml = HtmlProvider + +[] +let ``HtmlProvider JsonLd supports multiple type groups`` () = + let doc = JsonLdMultiTypeHtml.GetSample() + doc.JsonLd.WebPage |> should haveLength 1 + doc.JsonLd.Article |> should haveLength 1 + doc.JsonLd.WebPage.[0].Name |> should equal "My Page" + doc.JsonLd.Article.[0].Name |> should equal "My Article" + +[] +let jsonLdMixed = + """ + + + + +
+ + +
Column1Column2
Value1Value2
+ +""" + +type JsonLdMixedHtml = HtmlProvider + +[] +let ``HtmlProvider can combine JSON-LD and table extraction`` () = + let doc = JsonLdMixedHtml.GetSample() + doc.JsonLd.Article.[0].Name |> should equal "Mixed Page" + doc.Tables.Table1.Rows |> should haveLength 1