diff --git a/package-lock.json b/package-lock.json index 4686088d94d4..520c5e5cb78a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -67,9 +67,11 @@ "lowlight": "^3.3.0", "markdownlint-rule-helpers": "^0.25.0", "mdast-util-from-markdown": "^2.0.2", + "mdast-util-gfm": "^3.1.0", "mdast-util-to-hast": "^13.2.1", "mdast-util-to-markdown": "2.1.2", "mdast-util-to-string": "^4.0.0", + "micromark-extension-gfm": "^3.0.0", "next": "^16.1.5", "ora": "^9.0.0", "parse5": "7.1.2", @@ -97,6 +99,7 @@ "swr": "^2.2.5", "tcp-port-used": "1.0.2", "tsx": "^4.19.4", + "typescript": "^5.8.3", "unified": "^11.0.5", "unist-util-find": "^3.0.0", "unist-util-visit": "^5.0.0", @@ -175,7 +178,6 @@ "robots-parser": "^3.0.1", "sass": "^1.77.8", "start-server-and-test": "^2.0.11", - "typescript": "^5.8.3", "unist-util-remove": "^4.0.0", "unist-util-visit-parents": "6.0.1", "vitest": "^4.0.4", @@ -312,6 +314,7 @@ "version": "2.3.0", "resolved": "https://registry.npmjs.org/@ampproject/remapping/-/remapping-2.3.0.tgz", "integrity": "sha512-30iZtAPgz+LTIYoeivqYo853f02jBYSd5uGnGpkFV0M3xOt9aN73erkgYAmZU43x4VfqcnLxW9Kpg3R5LC4YYw==", + "peer": true, "dependencies": { "@jridgewell/gen-mapping": "^0.3.5", "@jridgewell/trace-mapping": "^0.3.24" @@ -497,6 +500,7 @@ "version": "7.23.3", "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.23.3.tgz", "integrity": "sha512-BmR4bWbDIoFJmJ9z2cZ8Gmm2MXgEDgjdWgpKmKWUt54UGFJdlj31ECtbaDvCG/qVdG3AQ1SfpZEs01lUFbzLOQ==", + "peer": true, "engines": { "node": ">=6.9.0" } @@ -534,12 +538,14 @@ "node_modules/@babel/core/node_modules/convert-source-map": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-2.0.0.tgz", - "integrity": "sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==" + "integrity": "sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==", + "peer": true }, "node_modules/@babel/core/node_modules/semver": { "version": "6.3.1", "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", + "peer": true, "bin": { "semver": "bin/semver.js" } @@ -573,6 +579,7 @@ "version": "7.22.15", "resolved": "https://registry.npmjs.org/@babel/helper-compilation-targets/-/helper-compilation-targets-7.22.15.tgz", "integrity": "sha512-y6EEzULok0Qvz8yyLkCvVX+02ic+By2UdOhylwUOvOn9dvYc9mKICJuuU1n1XBI02YWsNsnrY1kc6DVbjcXbtw==", + "peer": true, "dependencies": { "@babel/compat-data": "^7.22.9", "@babel/helper-validator-option": "^7.22.15", @@ -588,6 +595,7 @@ "version": "5.1.1", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", + "peer": true, "dependencies": { "yallist": "^3.0.2" } @@ -596,6 +604,7 @@ "version": "6.3.1", "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", + "peer": true, "bin": { "semver": "bin/semver.js" } @@ -603,7 +612,8 @@ "node_modules/@babel/helper-compilation-targets/node_modules/yallist": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz", - "integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==" + "integrity": 
"sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==", + "peer": true }, "node_modules/@babel/helper-environment-visitor": { "version": "7.22.20", @@ -651,6 +661,7 @@ "version": "7.23.3", "resolved": "https://registry.npmjs.org/@babel/helper-module-transforms/-/helper-module-transforms-7.23.3.tgz", "integrity": "sha512-7bBs4ED9OmswdfDzpz4MpWgSrV7FXlc3zIagvLFjS5H+Mk7Snr21vQ6QwrsoCGMfNC4e4LQPdoULEt4ykz0SRQ==", + "peer": true, "dependencies": { "@babel/helper-environment-visitor": "^7.22.20", "@babel/helper-module-imports": "^7.22.15", @@ -677,6 +688,7 @@ "version": "7.22.5", "resolved": "https://registry.npmjs.org/@babel/helper-simple-access/-/helper-simple-access-7.22.5.tgz", "integrity": "sha512-n0H99E/K+Bika3++WNL17POvo4rKWZ7lZEp1Q+fStVbUi8nxPQEBOlTmCOxW/0JsS56SKKQ+ojAe2pHKJHN35w==", + "peer": true, "dependencies": { "@babel/types": "^7.22.5" }, @@ -715,6 +727,7 @@ "version": "7.22.15", "resolved": "https://registry.npmjs.org/@babel/helper-validator-option/-/helper-validator-option-7.22.15.tgz", "integrity": "sha512-bMn7RmyFjY/mdECUbgn9eoSY4vqvacUnS9i9vGAGttgFWesO6B4CYWA7XlpbWgBt71iv/hfbPlynohStqnu5hA==", + "peer": true, "engines": { "node": ">=6.9.0" } @@ -723,6 +736,7 @@ "version": "7.26.10", "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.26.10.tgz", "integrity": "sha512-UPYc3SauzZ3JGgj87GgZ89JVdC5dj0AoetR5Bw6wj4niittNyFh6+eOGonYvJ1ao6B8lEa3Q3klS7ADZ53bc5g==", + "peer": true, "dependencies": { "@babel/template": "^7.26.9", "@babel/types": "^7.26.10" @@ -2639,7 +2653,6 @@ "resolved": "https://registry.npmjs.org/@octokit/core/-/core-7.0.6.tgz", "integrity": "sha512-DhGl4xMVFGVIyMwswXeyzdL4uXD5OGILGX5N8Y+f6W7LhC1Ze2poSNrkF/fedpVDHEEZ+PHFW0vL14I+mm8K3Q==", "license": "MIT", - "peer": true, "dependencies": { "@octokit/auth-token": "^6.0.0", "@octokit/graphql": "^9.0.3", @@ -2940,7 +2953,6 @@ "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz", "integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==", "license": "Apache-2.0", - "peer": true, "engines": { "node": ">=8.0.0" } @@ -3312,7 +3324,6 @@ "integrity": "sha512-vSMYtL/zOcFpvJCW71Q/OEGQb7KYBPAdKh35WNSkaZA75JlAO8ED8UN6GUNTm3drWomcbcqRPFqQbLae8yBTdg==", "devOptional": true, "license": "Apache-2.0", - "peer": true, "dependencies": { "playwright": "1.56.1" }, @@ -4136,7 +4147,6 @@ "integrity": "sha512-wGA0NX93b19/dZC1J18tKWVIYWyyF2ZjT9vin/NRu0qzzvfVzWjs04iq2rQ3H65vCTQYlRqs3YHfY7zjdV+9Kw==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@types/body-parser": "*", "@types/express-serve-static-core": "^5.0.0", @@ -4298,7 +4308,6 @@ "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.20.tgz", "integrity": "sha512-IPaCZN7PShZK/3t6Q87pfTkRm6oLTd4vztyoj+cbHUF1g3FfVb2tFIL79uCRKEfv16AhqDMBywP2VW3KIZUvcg==", "license": "MIT", - "peer": true, "dependencies": { "@types/prop-types": "*", "csstype": "^3.0.2" @@ -4310,7 +4319,6 @@ "integrity": "sha512-nf22//wEbKXusP6E9pfOCDwFdHAX4u172eaJI4YkDRQEZiorm6KfYnSC2SWLDMVWUOWPERmJnN0ujeAfTBLvrw==", "devOptional": true, "license": "MIT", - "peer": true, "peerDependencies": { "@types/react": "^18.0.0" } @@ -4481,7 +4489,6 @@ "integrity": "sha512-pUXGCuHnnKw6PyYq93lLRiZm3vjuslIy7tus1lIQTYVK9bL8XBgJnCWm8a0KcTtHC84Yya1Q6rtll+duSMj0dg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@typescript-eslint/scope-manager": "8.39.1", "@typescript-eslint/types": "8.39.1", @@ -5127,7 +5134,6 @@ "integrity": 
"sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "dev": true, "license": "MIT", - "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -5157,7 +5163,6 @@ "version": "8.17.1", "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.17.1.tgz", "integrity": "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g==", - "peer": true, "dependencies": { "fast-deep-equal": "^3.1.3", "fast-uri": "^3.0.1", @@ -5723,7 +5728,6 @@ } ], "license": "MIT", - "peer": true, "dependencies": { "caniuse-lite": "^1.0.30001733", "electron-to-chromium": "^1.5.199", @@ -6005,7 +6009,6 @@ "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.12.tgz", "integrity": "sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q==", "license": "MIT", - "peer": true, "dependencies": { "cheerio-select": "^2.1.0", "dom-serializer": "^2.0.0", @@ -7277,7 +7280,6 @@ "integrity": "sha512-TS9bTNIryDzStCpJN93aC5VRSW3uTx9sClUn4B87pwiCaJh220otoI0X8mJKr+VcPtniMdN8GKjlwgWGUv5ZKA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.2.0", "@eslint-community/regexpp": "^4.12.1", @@ -7339,7 +7341,6 @@ "integrity": "sha512-82GZUjRS0p/jganf6q1rEO25VSoHH0hKPCTrgillPjdI/3bgBhAE1QzHrHTizjpRvy6pGAvKjDJtk2pF9NDq8w==", "dev": true, "license": "MIT", - "peer": true, "bin": { "eslint-config-prettier": "bin/cli.js" }, @@ -7615,7 +7616,6 @@ "integrity": "sha512-whOE1HFo/qJDyX4SnXzP4N6zOWn79WhnCUY/iDR0mPfQZO8wcYE4JClzI2oZrhBnnMUCBCHZhO6VQyoBU95mZA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@rtsao/scc": "^1.1.0", "array-includes": "^3.1.9", @@ -8811,6 +8811,7 @@ "node_modules/gensync": { "version": "1.0.0-beta.2", "license": "MIT", + "peer": true, "engines": { "node": ">=6.9.0" } @@ -9017,7 +9018,6 @@ "resolved": "https://registry.npmjs.org/graphql/-/graphql-16.9.0.tgz", "integrity": "sha512-GGTKBX4SD7Wdb8mqeDLni2oaRGYQWjWHGKPQ24ZMnUtKfcsVoiv4uX8+LJr1K6U5VW2Lu1BwJnj7uiori0YtRw==", "dev": true, - "peer": true, "engines": { "node": "^12.22.0 || ^14.16.0 || ^16.0.0 || >=17.0.0" } @@ -10390,7 +10390,6 @@ "resolved": "https://registry.npmjs.org/jiti/-/jiti-2.6.1.tgz", "integrity": "sha512-ekilCSN1jwRvIbgeg/57YFh8qQDNbwDb9xT/qu2DAHbFFZUicIl4ygVaAvzveMhMVr3LnpSKTNnwt8PoOfmKhQ==", "dev": true, - "peer": true, "bin": { "jiti": "lib/jiti-cli.mjs" } @@ -10517,6 +10516,7 @@ "node_modules/json5": { "version": "2.2.3", "license": "MIT", + "peer": true, "bin": { "json5": "lib/cli.js" }, @@ -11239,9 +11239,10 @@ "integrity": "sha512-MFETx3tbTjE7Uk6vvnWINA/1iJ7LuMdO4fcq8UfF0pRbj01aGLduVvQcRyswuACJdpnHgg8E3rQLhaRdNEJS0w==" }, "node_modules/mdast-util-gfm": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/mdast-util-gfm/-/mdast-util-gfm-3.0.0.tgz", - "integrity": "sha512-dgQEX5Amaq+DuUqf26jJqSK9qgixgd6rYDHAv4aTBuA92cTknZlKpPfa86Z/s8Dj8xsAQpFfBmPUHWJBWqS4Bw==", + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm/-/mdast-util-gfm-3.1.0.tgz", + "integrity": "sha512-0ulfdQOM3ysHhCJ1p06l0b0VKlhU0wuQs3thxZQagjcjPrlFRqY215uZGHHJan9GEAXd9MbfPjFJz+qMkVR6zQ==", + "license": "MIT", "dependencies": { "mdast-util-from-markdown": "^2.0.0", "mdast-util-gfm-autolink-literal": "^2.0.0", @@ -11601,6 +11602,7 @@ "version": "3.0.0", "resolved": "https://registry.npmjs.org/micromark-extension-gfm/-/micromark-extension-gfm-3.0.0.tgz", "integrity": 
"sha512-vsKArQsicm7t0z2GugkCKtZehqUm31oeGBV/KVSorWSy8ZlNAv7ytjFhvaryUiCUJYqs+NoE6AFhpQvBTM6Q4w==", + "license": "MIT", "dependencies": { "micromark-extension-gfm-autolink-literal": "^2.0.0", "micromark-extension-gfm-footnote": "^2.0.0", @@ -13479,7 +13481,6 @@ "integrity": "sha512-hutraynyn31F+Bifme+Ps9Vq59hKuUCz7H1kDOcBs+2oGguKkWTU50bBWrtz34OUWmIwpBTWDxaRPXrIXkgvmQ==", "devOptional": true, "license": "Apache-2.0", - "peer": true, "bin": { "playwright-core": "cli.js" }, @@ -13543,7 +13544,6 @@ "integrity": "sha512-QQtaxnoDJeAkDvDKWCLiwIXkTgRhwYDEQCghU9Z6q03iyek/rxRh/2lC3HB7P8sWT2xC/y5JDctPLBIGzHKbhw==", "dev": true, "license": "MIT", - "peer": true, "bin": { "prettier": "bin/prettier.cjs" }, @@ -13713,7 +13713,6 @@ "version": "18.3.1", "resolved": "https://registry.npmjs.org/react/-/react-18.3.1.tgz", "integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==", - "peer": true, "dependencies": { "loose-envify": "^1.1.0" }, @@ -13734,7 +13733,6 @@ "version": "18.3.1", "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.3.1.tgz", "integrity": "sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==", - "peer": true, "dependencies": { "loose-envify": "^1.1.0", "scheduler": "^0.23.2" @@ -14402,7 +14400,6 @@ "integrity": "sha512-d0NoFH4v6SjEK7BoX810Jsrhj7IQSYHAHLi/iSpgqKc7LaIDshFRlSg5LOymf9FqQhxEHs2W5ZQXlvy0KD45Uw==", "devOptional": true, "license": "MIT", - "peer": true, "dependencies": { "chokidar": "^4.0.0", "immutable": "^5.0.2", @@ -15340,7 +15337,6 @@ "resolved": "https://registry.npmjs.org/styled-components/-/styled-components-5.3.11.tgz", "integrity": "sha512-uuzIIfnVkagcVHv9nE0VPlHPSCmXIUGKfJ42LNjxCCTDTL5sgnJ8Z7GZBq0EnLYGln77tPpEpExt2+qa+cZqSw==", "license": "MIT", - "peer": true, "dependencies": { "@babel/helper-module-imports": "^7.0.0", "@babel/traverse": "^7.4.5", @@ -15599,7 +15595,6 @@ "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, "license": "MIT", - "peer": true, "engines": { "node": ">=12" }, @@ -15923,9 +15918,7 @@ "version": "5.8.3", "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.8.3.tgz", "integrity": "sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==", - "dev": true, "license": "Apache-2.0", - "peer": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" @@ -16280,7 +16273,6 @@ "dev": true, "hasInstallScript": true, "license": "MIT", - "peer": true, "dependencies": { "napi-postinstall": "^0.2.2" }, @@ -16483,7 +16475,6 @@ "integrity": "sha512-ZWyE8YXEXqJrrSLvYgrRP7p62OziLW7xI5HYGWFzOvupfAlrLvURSzv/FyGyy0eidogEM3ujU+kUG1zuHgb6Ug==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "esbuild": "^0.25.0", "fdir": "^6.5.0", @@ -16592,7 +16583,6 @@ "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, "license": "MIT", - "peer": true, "engines": { "node": ">=12" }, diff --git a/package.json b/package.json index 520c2d9d8baa..a612cf398cb7 100644 --- a/package.json +++ b/package.json @@ -217,9 +217,11 @@ "lowlight": "^3.3.0", "markdownlint-rule-helpers": "^0.25.0", "mdast-util-from-markdown": "^2.0.2", + "mdast-util-gfm": "^3.1.0", "mdast-util-to-hast": "^13.2.1", "mdast-util-to-markdown": "2.1.2", "mdast-util-to-string": "^4.0.0", + "micromark-extension-gfm": "^3.0.0", "next": "^16.1.5", "ora": "^9.0.0", "parse5": "7.1.2", @@ -247,6 +249,7 @@ 
"swr": "^2.2.5", "tcp-port-used": "1.0.2", "tsx": "^4.19.4", + "typescript": "^5.8.3", "unified": "^11.0.5", "unist-util-find": "^3.0.0", "unist-util-visit": "^5.0.0", @@ -325,7 +328,6 @@ "robots-parser": "^3.0.1", "sass": "^1.77.8", "start-server-and-test": "^2.0.11", - "typescript": "^5.8.3", "unist-util-remove": "^4.0.0", "unist-util-visit-parents": "6.0.1", "vitest": "^4.0.4", diff --git a/src/search/scripts/scrape/README.md b/src/search/scripts/scrape/README.md index 538052f51b96..5434aba6d179 100644 --- a/src/search/scripts/scrape/README.md +++ b/src/search/scripts/scrape/README.md @@ -1,16 +1,36 @@ # Scraping for General Search -We need to scrape each page on the Docs site and use the data we scrape to index Elasticsearch. +We fetch each page's content via the Article API and use the structured data to index Elasticsearch. This replaced the previous approach of rendering full HTML pages and scraping them with cheerio. We currently only scrape for **general search** results. Autocomplete search data is generated from analytics events and GPT queries. +## How it works + +The scrape script starts by loading all indexable pages, then for each page it calls the Article API (`/api/article?pathname=`) on the local server. The API returns structured JSON with the page's title, intro, breadcrumbs, and markdown body. The markdown is parsed into an AST with GFM support (so tables are handled cleanly), navigational headings like "Further reading" are filtered out, and the full content (including code blocks) is converted to plain text for indexing. + +The implementation lives in `lib/build-records-from-api.ts`. + ## CLI Script -Before running the scraping script ensure that the server is running in another terminal with `npm run general-search-scrape-server` +Before running the scraping script, start the server in another terminal: + +```bash +npm run general-search-scrape-server +``` + +Then run the scrape: -Run the script with `npm run general-search-scrape -- ` +```bash +npm run general-search-scrape -- +``` + +To scrape a specific language and version: + +```bash +npx tsx src/search/scripts/scrape/scrape-cli.ts -l en -V fpt +``` After a successful run it will generate a series of JSON files with the page data of every page of the Docs site into the passed directory. diff --git a/src/search/scripts/scrape/lib/build-records-from-api.ts b/src/search/scripts/scrape/lib/build-records-from-api.ts new file mode 100644 index 000000000000..ea618f092531 --- /dev/null +++ b/src/search/scripts/scrape/lib/build-records-from-api.ts @@ -0,0 +1,488 @@ +/** + * Build search records using the Article API instead of HTML scraping. + * + * This module provides functions to fetch article content via the Article API + * and convert it to search index records. 
This approach is faster and more + * reliable than HTML scraping because it: + * - Fetches pre-rendered markdown directly (no full HTML rendering) + * - Uses structured metadata (title, intro, breadcrumbs) from API + * - Parses headings from markdown using mdast (proper AST parsing) + */ + +import Bottleneck from 'bottleneck' +import chalk from 'chalk' +import dotenv from 'dotenv' +import boxen from 'boxen' +import { fromMarkdown } from 'mdast-util-from-markdown' +import { toString } from 'mdast-util-to-string' +import { visit } from 'unist-util-visit' +import { gfm } from 'micromark-extension-gfm' +import { gfmFromMarkdown } from 'mdast-util-gfm' +import GithubSlugger from 'github-slugger' +import type { Node, Parent } from 'unist' + +import languages from '@/languages/lib/languages-server' +import getPopularPages from '@/search/scripts/scrape/lib/popular-pages' +import { getAllVersionsKeyFromIndexVersion } from '@/search/lib/elasticsearch-versions' +import { fetchWithRetry } from '@/frame/lib/fetch-utils' + +import type { + Record, + FailedPage, + Page, + Permalink, + Config, + Redirects, +} from '@/search/scripts/scrape/types' + +// Same ignored headings as the HTML scraping approach +const IGNORED_HEADING_SLUGS = new Set(['in-this-article', 'further-reading', 'prerequisites']) + +// Known translations of the 3 ignored navigational headings. +// These are used as a fallback when github-slugger produces non-ASCII slugs +// that don't match the English slug set above. +const IGNORED_HEADING_TEXTS = new Set([ + // English (lowercase) + 'in this article', + 'further reading', + 'prerequisites', + // Japanese (ja) + 'この記事の内容', + '参考資料', + '前提条件', + // Chinese (zh) + '本文内容', + '延伸阅读', + '先决条件', + // Korean (ko) + '이 문서의 내용', + '추가 참고 자료', + '필수 조건', + // Spanish (es) + 'en este artículo', + 'información adicional', + 'requisitos previos', + // Portuguese (pt) + 'neste artigo', + 'leitura adicional', + 'pré-requisitos', + // Russian (ru) + 'в этой статье', + 'дополнительные материалы', + 'необходимые компоненты', + // French (fr) + 'dans cet article', + 'pour aller plus loin', + 'prérequis', + // German (de) + 'in diesem artikel', + 'weiterführende themen', + 'voraussetzungen', +]) + +// Default port matches build-records.ts for consistency +const DEFAULT_PORT = 4002 + +dotenv.config() + +// These defaults are known to work fine in GitHub Actions. +const MAX_CONCURRENT = parseInt(process.env.BUILD_RECORDS_MAX_CONCURRENT || '5', 10) +const MIN_TIME = parseInt(process.env.BUILD_RECORDS_MIN_TIME || '200', 10) + +// These products forcibly get a popularity of 0 +const FORCE_0_POPULARITY_PRODUCTS = new Set(['contributing']) + +const pageMarker = chalk.green('|') +const recordMarker = chalk.grey('.') + +interface HeadingNode extends Node { + type: 'heading' + depth: number +} + +export interface ArticleApiResponse { + meta: { + title: string + intro: string + product: string + breadcrumbs?: Array<{ href: string; title: string }> + } + body: string +} + +export interface ArticleApiErrorResponse { + error: string +} + +export type ArticleApiResult = ArticleApiResponse | ArticleApiErrorResponse + +/** + * Parse markdown into an AST with GFM support (tables, strikethrough, etc.). + */ +function parseMarkdown(markdown: string) { + return fromMarkdown(markdown, { + extensions: [gfm()], + mdastExtensions: [gfmFromMarkdown()], + }) +} + +// Block container types whose children should be separated by newlines. +// These contain other block-level nodes (paragraphs, lists, etc.) 
and +// toString() would concatenate them without whitespace, producing tokens +// like "SSH.Make" that the ES tokenizer can't split. +const BLOCK_CONTAINER_TYPES = new Set([ + 'root', + 'blockquote', + 'list', + 'listItem', + 'table', + 'tableRow', + 'footnoteDefinition', +]) + +/** + * Convert an AST to plain text, joining block-level children with newlines. + * Recurses into block containers (lists, blockquotes, etc.) so that nested + * block boundaries also get whitespace — not just the root level. + */ +function astToPlainText(node: Node): string { + const parent = node as Parent + if (!parent.children) { + return toString(node) + } + + if (BLOCK_CONTAINER_TYPES.has(node.type)) { + return parent.children.map((child) => astToPlainText(child)).join('\n') + } + + // Leaf blocks (paragraph, heading, tableCell) and inline nodes: + // concatenate inline text directly. + return toString(node) +} + +/** + * Extract headings and plain-text content from markdown in a single AST pass. + * Headings are extracted first, then the full AST (including code blocks) + * is converted to plain text so that terms inside code examples remain + * searchable (e.g. `ssh_url`, `ssh://`). + */ +export function extractFromMarkdown(markdown: string): { headings: string; content: string } { + const ast = parseMarkdown(markdown) + + // 1. Extract h2 headings from the AST + const headings: string[] = [] + const slugger = new GithubSlugger() + + visit(ast, (node: Node) => { + if (node.type !== 'heading') return + const headingNode = node as HeadingNode + if (headingNode.depth !== 2) return + + const headingText = toString(node) + const slug = slugger.slug(headingText) + + // Skip navigational headings by slug or known translated text + if (IGNORED_HEADING_SLUGS.has(slug)) return + if (IGNORED_HEADING_TEXTS.has(headingText.toLowerCase().trim())) return + + headings.push(headingText) + }) + + // 2. Convert full AST to plain text (code blocks are kept so that terms + // appearing only in code examples remain searchable). + const content = astToPlainText(ast) + + return { headings: headings.join('\n'), content } +} + +/** + * Extract h2 headings from markdown content using mdast parser. + * Filters out navigational headings (in-this-article, further-reading, prerequisites). + */ +export function extractHeadingsFromMarkdown(markdown: string): string { + return extractFromMarkdown(markdown).headings +} + +/** + * Convert markdown to plain text for search indexing using mdast. + * This extracts all text content from the markdown AST, including code blocks. + */ +export function markdownToPlainText(markdown: string): string { + return extractFromMarkdown(markdown).content +} + +/** + * Convert Article API response to a search record. + */ +export function articleApiResponseToRecord(pathname: string, data: ArticleApiResponse): Record { + // Build breadcrumbs string (excluding the last one which is the current page) + const breadcrumbsArray = data.meta.breadcrumbs?.map((b) => b.title) || [] + const breadcrumbs = + breadcrumbsArray + .slice(0, breadcrumbsArray.length > 1 ? -1 : breadcrumbsArray.length) + .join(' / ') || '' + + // Single-pass extraction: parse markdown once to get both headings and content + const { headings, content: bodyText } = extractFromMarkdown(data.body) + + // Combine intro with body if intro isn't already in body + const intro = data.meta.intro || '' + const content = + intro && !bodyText.includes(intro.trim()) + ? 
`${intro.trim()}\n${bodyText.trim()}`.trim() : bodyText.trim() + + return { + objectID: pathname, + breadcrumbs, + title: data.meta.title, + headings, + content, + intro, + toplevel: breadcrumbsArray[0] || '', + } +} + +export interface FetchResult { + record: Record | null + failure: FailedPage | null +} + +function isErrorResponse(data: ArticleApiResult): data is ArticleApiErrorResponse { + return 'error' in data +} + +/** + * Fetch article from API and convert to search record. + */ +export async function fetchArticleAsRecord( + pathname: string, + baseUrl: string = `http://localhost:${DEFAULT_PORT}`, +): Promise<FetchResult> { + const url = `${baseUrl}/api/article?pathname=${encodeURIComponent(pathname)}` + + try { + const response = await fetchWithRetry(url, undefined, { + retries: 3, + throwHttpErrors: false, + timeout: 60000, + }) + if (!response.ok) { + let errorMessage = `HTTP ${response.status}: ${response.statusText}` + let errorType = `HTTP ${response.status}` + try { + const body = await response.json() + if (body && typeof body.error === 'string') { + errorMessage = body.error + errorType = 'API Error' + } + } catch { + /* ignore JSON parse errors */ + } + return { + record: null, + failure: { + url: pathname, + error: errorMessage, + errorType, + }, + } + } + + const data = (await response.json()) as ArticleApiResult + + // Check for error response (e.g., archived pages) + if (isErrorResponse(data)) { + return { + record: null, + failure: { + url: pathname, + error: data.error, + errorType: 'API Error', + }, + } + } + + const record = articleApiResponseToRecord(pathname, data) + return { record, failure: null } + } catch (error) { + const message = error instanceof Error ? error.message : String(error) + const errorName = error instanceof Error ? error.name : undefined + const errorCode = (error as { code?: string }).code + + // Prefer structured timeout indicators (name/code), with a documented + // fallback to message inspection for environments that only expose text. + const isTimeout = + errorName === 'AbortError' || + errorCode === 'ETIMEDOUT' || + errorCode === 'ECONNABORTED' || + message.toLowerCase().includes('timeout') + + return { + record: null, + failure: { + url: pathname, + error: message, + errorType: isTimeout ? 'Timeout' : 'Network Error', + }, + } + } +} + +export interface BuildRecordsResult { + records: Record[] + failedPages: FailedPage[] +} + +/** + * Build search records for a given index using the Article API. + * This is a drop-in replacement for buildRecords from build-records.ts. + */ +export default async function buildRecordsFromApi( + indexName: string, + indexablePages: Page[], + indexVersion: string, + languageCode: string, + redirects: Redirects, + config: Config = {} as Config, +): Promise<BuildRecordsResult> { + const pageVersion = getAllVersionsKeyFromIndexVersion(indexVersion) + const { noMarkers, docsInternalDataPath } = config + + console.log(`\n\nBuilding records for index '${indexName}' (${languages[languageCode].name})`) + + const records: Record[] = [] + const failedPages: FailedPage[] = [] + + // Filter pages for this language and version + const pages = indexablePages + .filter((page) => page.languageCode === languageCode) + .filter((page) => page.permalinks.some((permalink) => permalink.pageVersion === pageVersion)) + + // Get permalinks for this language and version, deduplicating by href. + // Cross-product children can cause the same page to appear multiple + // times in the tree under different parents. 
+ const seen = new Set() + const permalinks = pages + .map((page) => + page.permalinks.find( + (permalink) => + permalink.languageCode === languageCode && permalink.pageVersion === pageVersion, + ), + ) + .filter((permalink): permalink is Permalink => { + if (!permalink) return false + if (seen.has(permalink.href)) return false + seen.add(permalink.href) + return true + }) + + const popularPages = docsInternalDataPath + ? await getPopularPages(docsInternalDataPath, redirects, indexVersion, languageCode) + : {} + + console.log('indexable pages', indexablePages.length) + console.log('pages in index', pages.length) + console.log('permalinks in index', permalinks.length) + console.log(pageMarker, 'denotes pages') + console.log(recordMarker, 'denotes records derived from sections of pages') + console.log('popular page ratios', Object.keys(popularPages).length) + + const hasPopularPages = Object.keys(popularPages).length > 0 + const baseUrl = `http://localhost:${DEFAULT_PORT}` + + // Use Bottleneck for rate limiting + const limiter = new Bottleneck({ + maxConcurrent: MAX_CONCURRENT, + minTime: MIN_TIME, + }) + + // Process all permalinks with rate limiting + const fetchPromises = permalinks.map((permalink) => + limiter.schedule(async () => { + const result = await fetchArticleAsRecord(permalink.href, baseUrl) + + if (result.failure) { + result.failure.relativePath = permalink.relativePath + failedPages.push(result.failure) + if (!noMarkers) process.stdout.write(chalk.red('✗')) + return null + } + + if (result.record) { + // Apply popularity + const pathArticle = permalink.relativePath.replace('/index.md', '').replace('.md', '') + let popularity = (hasPopularPages && popularPages[pathArticle]) || 0.0 + + if (FORCE_0_POPULARITY_PRODUCTS.size) { + const product = result.record.objectID.split('/')[2] + if (FORCE_0_POPULARITY_PRODUCTS.has(product)) { + popularity = 0.0 + } + } + + result.record.popularity = popularity + if (!noMarkers) process.stdout.write(pageMarker + recordMarker) + return result.record + } + + return null + }), + ) + + const results = await Promise.all(fetchPromises) + for (const record of results) { + if (record) records.push(record) + } + + console.log('\nrecords in index: ', records.length) + + // Report failed pages (same format as build-records.ts) + if (failedPages.length > 0) { + const failureCount = failedPages.length + const header = chalk.bold.red(`${failureCount} page(s) failed to scrape\n\n`) + + const failureList = failedPages + .slice(0, 10) + .map((failure, idx) => { + const number = chalk.gray(`${idx + 1}. `) + const errorType = chalk.yellow(failure.errorType) + const pathLine = failure.relativePath + ? `\n${chalk.cyan(' Path: ')}${failure.relativePath}` + : '' + const urlLine = failure.url ? `\n${chalk.cyan(' URL: ')}${failure.url}` : '' + const errorLine = `\n${chalk.gray(` Error: ${failure.error}`)}` + + return `${number}${errorType}${pathLine}${urlLine}${errorLine}` + }) + .join('\n\n') + + const remaining = + failureCount > 10 ? `\n\n${chalk.gray(`... and ${failureCount - 10} more`)}` : '' + + const boxContent = header + failureList + remaining + const box = boxen(boxContent, { + title: chalk.red('⚠ Failed Pages'), + padding: 1, + borderColor: 'yellow', + }) + + console.log(`\n${box}\n`) + + console.log( + chalk.yellow( + `💡 Tip: These failures won't stop the scraping process. 
The script will continue with the remaining pages.`, + ), + ) + + if (failedPages.some((f) => f.errorType === 'Timeout')) { + console.log( + chalk.gray( + ` For timeout errors, try: export BUILD_RECORDS_MAX_CONCURRENT=50 (currently ${MAX_CONCURRENT})`, + ), + ) + } + } + + return { records, failedPages } +} diff --git a/src/search/scripts/scrape/lib/build-records.ts b/src/search/scripts/scrape/lib/build-records.ts deleted file mode 100644 index d5a3b0336b2a..000000000000 --- a/src/search/scripts/scrape/lib/build-records.ts +++ /dev/null @@ -1,234 +0,0 @@ -import eventToPromise from 'event-to-promise' -import chalk from 'chalk' -import dotenv from 'dotenv' -import boxen from 'boxen' - -import languages from '@/languages/lib/languages-server' -import parsePageSectionsIntoRecords from '@/search/scripts/scrape/lib/parse-page-sections-into-records' -import getPopularPages from '@/search/scripts/scrape/lib/popular-pages' -import domwaiter from '@/search/scripts/scrape/lib/domwaiter' -import { getAllVersionsKeyFromIndexVersion } from '@/search/lib/elasticsearch-versions' - -import type { Page, Permalink, Record, Config, Redirects } from '@/search/scripts/scrape/types' - -// Custom error class to replace got's HTTPError -class HTTPError extends Error { - response: { ok: boolean; statusCode?: number } - request: { requestUrl?: { pathname?: string } } - - constructor( - message: string, - response: { ok: boolean; statusCode?: number }, - request: { requestUrl?: { pathname?: string } }, - ) { - super(message) - this.name = 'HTTPError' - this.response = response - this.request = request - } -} - -const pageMarker = chalk.green('|') -const recordMarker = chalk.grey('.') -const port = 4002 - -dotenv.config() - -// These defaults are known to work fine in GitHub Actions. -// For local development, you can override these in your local .env file. -// For example: -// echo 'BUILD_RECORDS_MAX_CONCURRENT=5' >> .env -// echo 'BUILD_RECORDS_MIN_TIME=200' >> .env -const MAX_CONCURRENT = parseInt(process.env.BUILD_RECORDS_MAX_CONCURRENT || '5', 10) -const MIN_TIME = parseInt(process.env.BUILD_RECORDS_MIN_TIME || '200', 10) - -// These products, forcibly always get a popularity of 0 independent of -// their actual popularity which comes from an external JSON file. -// The objective for this is to reduce their search result ranking -// when multiple docs match on a certain keyword(s). 
-const FORCE_0_POPULARITY_PRODUCTS = new Set(['contributing']) - -interface FailedPage { - url?: string - relativePath?: string - error: string - errorType: string -} - -export interface BuildRecordsResult { - records: Record[] - failedPages: FailedPage[] -} - -export default async function buildRecords( - indexName: string, - indexablePages: Page[], - indexVersion: string, - languageCode: string, - redirects: Redirects, - config: Config = {} as Config, -): Promise { - // Determine the page version from the index version - const pageVersion = getAllVersionsKeyFromIndexVersion(indexVersion) - - const { noMarkers, docsInternalDataPath } = config - console.log(`\n\nBuilding records for index '${indexName}' (${languages[languageCode].name})`) - const records: Record[] = [] - const pages = indexablePages - // exclude pages that are not in the current language - .filter((page) => page.languageCode === languageCode) - // exclude pages that don't have a permalink for the current product version - .filter((page) => page.permalinks.some((permalink) => permalink.pageVersion === pageVersion)) - - // Find the approve permalink for the given language and GitHub product variant (dotcom v enterprise) - const permalinks = pages - .map((page) => { - return page.permalinks.find((permalink) => { - return permalink.languageCode === languageCode && permalink.pageVersion === pageVersion - }) - }) - .map((permalink) => { - if (permalink) { - permalink.url = `http://localhost:${port}${permalink.href}` - } - return permalink - }) - .filter((permalink): permalink is Permalink => permalink !== undefined) - - const popularPages = docsInternalDataPath - ? await getPopularPages(docsInternalDataPath, redirects, indexVersion, languageCode) - : {} - - console.log('indexable pages', indexablePages.length) - console.log('pages in index', pages.length) - console.log('permalinks in index', permalinks.length) - console.log(pageMarker, 'denotes pages') - console.log(recordMarker, 'denotes records derived from sections of pages') - console.log('popular page ratios', Object.keys(popularPages).length) - - const hasPopularPages = Object.keys(popularPages).length > 0 - - // Track failed pages - const failedPages: FailedPage[] = [] - - const waiter = domwaiter(permalinks, { maxConcurrent: MAX_CONCURRENT, minTime: MIN_TIME }) - .on('page', (page) => { - if (!noMarkers) process.stdout.write(pageMarker) - const newRecord = parsePageSectionsIntoRecords(page) - const pathArticle = page.relativePath.replace('/index.md', '').replace('.md', '') - let popularity = (hasPopularPages && popularPages[pathArticle]) || 0.0 - if (FORCE_0_POPULARITY_PRODUCTS.size) { - const product = newRecord.objectID.split('/')[2] - if (FORCE_0_POPULARITY_PRODUCTS.has(product)) { - popularity = 0.0 - } - } - newRecord.popularity = popularity - - if (!noMarkers) process.stdout.write(recordMarker) - records.push(newRecord) - }) - .on('error', (err) => { - // Track the failure - const url = (err as unknown as { url?: string }).url - const relativePath = (err as unknown as { relativePath?: string }).relativePath - - // Check for HTTPError by name since it may come from a different module - if ( - (err instanceof HTTPError || err?.name === 'HTTPError') && - (err as unknown as HTTPError).response - ) { - const httpErr = err as unknown as HTTPError - failedPages.push({ - url: httpErr.request?.requestUrl?.pathname || url, - relativePath, - error: err.message, - errorType: `HTTP ${httpErr.response?.statusCode || 'Error'}`, - }) - - if (!noMarkers) 
process.stdout.write(chalk.red('✗')) - } else if (err instanceof Error) { - // Enhanced error handling for timeout and network errors - const errorType = (err.cause as unknown as { code?: string })?.code || err.name - const isTimeout = - errorType === 'UND_ERR_HEADERS_TIMEOUT' || - errorType === 'UND_ERR_CONNECT_TIMEOUT' || - err.message.includes('timed out') - - failedPages.push({ - url, - relativePath, - error: err.message, - errorType: isTimeout ? 'Timeout' : errorType || 'Unknown Error', - }) - - if (!noMarkers) process.stdout.write(chalk.red('✗')) - } else { - failedPages.push({ - url, - relativePath, - error: String(err), - errorType: 'Unknown Error', - }) - - if (!noMarkers) process.stdout.write(chalk.red('✗')) - } - }) - - // Wait for 'done' event but ignore 'error' events (they're handled by the error listener above) - await eventToPromise(waiter, 'done', { ignoreErrors: true }) - console.log('\nrecords in index: ', records.length) - - // Report failed pages if any - if (failedPages.length > 0) { - const failureCount = failedPages.length - const header = chalk.bold.red(`${failureCount} page(s) failed to scrape\n\n`) - - const failureList = failedPages - .slice(0, 10) // Show first 10 failures - .map((failure, idx) => { - const number = chalk.gray(`${idx + 1}. `) - const errorType = chalk.yellow(failure.errorType) - const pathLine = failure.relativePath - ? `\n${chalk.cyan(' Path: ')}${failure.relativePath}` - : '' - const urlLine = failure.url ? `\n${chalk.cyan(' URL: ')}${failure.url}` : '' - const errorLine = `\n${chalk.gray(` Error: ${failure.error}`)}` - - return `${number}${errorType}${pathLine}${urlLine}${errorLine}` - }) - .join('\n\n') - - const remaining = - failureCount > 10 ? `\n\n${chalk.gray(`... and ${failureCount - 10} more`)}` : '' - - const boxContent = header + failureList + remaining - const box = boxen(boxContent, { - title: chalk.red('⚠ Failed Pages'), - padding: 1, - borderColor: 'yellow', - }) - - console.log(`\n${box}\n`) - - // Log suggestion - console.log( - chalk.yellow( - `💡 Tip: These failures won't stop the scraping process. 
The script will continue with the remaining pages.`, - ), - ) - - if (failedPages.some((f) => f.errorType === 'Timeout')) { - console.log( - chalk.gray( - ` For timeout errors, try: export BUILD_RECORDS_MAX_CONCURRENT=50 (currently ${MAX_CONCURRENT})`, - ), - ) - } - } - - return { - records, - failedPages, - } -} diff --git a/src/search/scripts/scrape/lib/domwaiter.ts b/src/search/scripts/scrape/lib/domwaiter.ts deleted file mode 100644 index 70e1251f6fe0..000000000000 --- a/src/search/scripts/scrape/lib/domwaiter.ts +++ /dev/null @@ -1,167 +0,0 @@ -import { EventEmitter } from 'events' -import Bottleneck from 'bottleneck' -import { fetchWithRetry } from '@/frame/lib/fetch-utils' -import cheerio from 'cheerio' - -import type { Permalink } from '@/search/scripts/scrape/types' - -// Custom error class to match got's HTTPError interface -class HTTPError extends Error { - response: { ok: boolean; statusCode?: number } - request: { requestUrl?: { pathname?: string } } - - constructor( - message: string, - response: { ok: boolean; statusCode?: number }, - request: { requestUrl?: { pathname?: string } }, - ) { - super(message) - this.name = 'HTTPError' - this.response = response - this.request = request - } -} - -// Type aliases for error objects with additional URL information -type HTTPErrorWithUrl = HTTPError & { url?: string; relativePath?: string } -type ErrorWithUrl = Error & { url?: string; relativePath?: string } - -interface DomWaiterOptions { - parseDOM?: boolean - json?: boolean - maxConcurrent?: number - minTime?: number -} - -export default function domwaiter(pages: Permalink[], opts: DomWaiterOptions = {}): EventEmitter { - const emitter = new EventEmitter() - - // Add a default no-op error handler to prevent EventEmitter from throwing - // when errors are emitted before the caller attaches their error handler - // This will be overridden/supplemented by the caller's error handler - const defaultErrorHandler = () => { - // No-op: prevents EventEmitter from throwing - // External handlers will still receive the error - } - emitter.on('error', defaultErrorHandler) - - const defaults = { - parseDOM: true, - json: false, - maxConcurrent: 5, - minTime: 500, - } - opts = Object.assign(defaults, opts) - - const limiter = new Bottleneck(opts) - - for (const page of pages) { - async function schedulePage() { - try { - await limiter.schedule(() => getPage(page, emitter, opts)) - } catch (err) { - // Catch any unhandled promise rejections - emitter.emit('error', err) - } - } - - schedulePage() - } - - limiter.on('idle', () => { - emitter.emit('done') - }) - - limiter.on('error', (err) => { - emitter.emit('error', err) - }) - - return emitter -} - -async function getPage(page: Permalink, emitter: EventEmitter, opts: DomWaiterOptions) { - // Wrap everything in a try-catch to ensure no errors escape - try { - emitter.emit('beforePageLoad', page) - - if (opts.json) { - try { - const response = await fetchWithRetry(page.url!, undefined, { - retries: 3, - throwHttpErrors: false, - timeout: 60000, - }) - if (!response.ok) { - const httpError = new HTTPError( - `HTTP ${response.status}: ${response.statusText}`, - { ok: response.ok, statusCode: response.status }, - { requestUrl: { pathname: page.url } }, - ) - // Add URL and path info directly to the HTTPError - ;(httpError as HTTPErrorWithUrl).url = page.url - ;(httpError as HTTPErrorWithUrl).relativePath = page.relativePath - // Emit error instead of throwing - emitter.emit('error', httpError) - return // Exit early, don't continue processing - } - 
const json = await response.json() - const pageCopy = Object.assign({}, page, { json }) - emitter.emit('page', pageCopy) - } catch (err) { - // Enhance error with URL information - if (err instanceof Error && page.url) { - const enhancedError = new Error(err.message, { cause: err.cause }) - enhancedError.name = err.name - enhancedError.stack = err.stack - ;(enhancedError as ErrorWithUrl).url = page.url - ;(enhancedError as ErrorWithUrl).relativePath = page.relativePath - emitter.emit('error', enhancedError) - } else { - emitter.emit('error', err) - } - } - } else { - try { - const response = await fetchWithRetry(page.url!, undefined, { - retries: 3, - throwHttpErrors: false, - timeout: 60000, - }) - if (!response.ok) { - const httpError = new HTTPError( - `HTTP ${response.status}: ${response.statusText}`, - { ok: response.ok, statusCode: response.status }, - { requestUrl: { pathname: page.url } }, - ) - // Add URL and path info directly to the HTTPError - ;(httpError as HTTPErrorWithUrl).url = page.url - ;(httpError as HTTPErrorWithUrl).relativePath = page.relativePath - // Emit error instead of throwing - emitter.emit('error', httpError) - return // Exit early, don't continue processing - } - const body = await response.text() - const pageCopy = Object.assign({}, page, { body }) - if (opts.parseDOM) - (pageCopy as Permalink & { $?: ReturnType }).$ = cheerio.load(body) - emitter.emit('page', pageCopy) - } catch (err) { - // Enhance error with URL information - if (err instanceof Error && page.url) { - const enhancedError = new Error(err.message, { cause: err.cause }) - enhancedError.name = err.name - enhancedError.stack = err.stack - ;(enhancedError as ErrorWithUrl).url = page.url - ;(enhancedError as ErrorWithUrl).relativePath = page.relativePath - emitter.emit('error', enhancedError) - } else { - emitter.emit('error', err) - } - } - } - } catch (err) { - // Ultimate catch-all to ensure nothing escapes - console.error('Unexpected error in getPage:', err) - emitter.emit('error', err) - } -} diff --git a/src/search/scripts/scrape/lib/parse-page-sections-into-records.ts b/src/search/scripts/scrape/lib/parse-page-sections-into-records.ts deleted file mode 100644 index c862ad5d2464..000000000000 --- a/src/search/scripts/scrape/lib/parse-page-sections-into-records.ts +++ /dev/null @@ -1,95 +0,0 @@ -import { render } from 'cheerio-to-text' - -import type { Record } from '@/search/scripts/scrape/types' - -// This module takes cheerio page object and divides it into sections -// using H1,H2 heading elements as section delimiters. The text -// that follows each heading becomes the content of the search record. - -const ignoredHeadingSlugs = ['in-this-article', 'further-reading', 'prerequisites'] - -export default function parsePageSectionsIntoRecords(page: any): Record { - const { href, $ } = page - const title = $('h1').first().text().trim() - const breadcrumbsArray = $('[data-search=breadcrumbs] nav.breadcrumbs a') - .map((i: number, el: any) => { - return $(el).text().trim().replace('/', '').replace(/\s+/g, ' ') - }) - .get() - - // Like in printing from DOM, some elements should not be included in - // the records for search. This might be navigational elements of the - // page that don't make much sense to find in a site search. 
- $('[data-search=hide]').remove() - - // Only slice off the last one if the length of the array is greater than 1 - // On an article page, we the breadcrumbs array will be something - // like: - // - // ['Product short title', 'Subcategory', 'Article title'] - // - // But on a product landing page, it'll just be: - // - // ['Product short title'] - // - // So here, if we skip the last one we get nothing for the breadcrumb. - const breadcrumbs = - breadcrumbsArray - .slice(0, breadcrumbsArray.length > 1 ? -1 : breadcrumbsArray.length) - .join(' / ') || '' - - const toplevel = breadcrumbsArray[0] || '' - const objectID = href - - const rootSelector = '[data-search=article-body]' - const $root = $(rootSelector) - if ($root.length === 0) { - console.warn(`${href} has no '${rootSelector}'`) - } else if ($root.length > 1) { - console.warn(`${href} has more than one '${rootSelector}' (${$root.length})`) - } - - const $sections = $('h2', $root) - .filter('[id]') - .filter((i: number, el: any) => { - return !ignoredHeadingSlugs.includes($(el).attr('id')) - }) - - const headings = $sections - .map((i: number, el: any) => $(el).text()) - .get() - .join('\n') - .trim() - - const intro = $('[data-search=lead] p').text().trim() - - let body = '' - // Typical example pages with no `$root` are: - // https://docs.github.com/en/code-security/guides - // - // We need to avoid these because if you use `getAllText()` on these - // pages, it will extract *everything* from the page, which will - // include the side bar and footer. - // Note: We're not adding custom extraction for guide pages as they are - // being phased out and don't warrant the effort. - if ($root.length > 0) { - body = render($root) - } - - if (!body && !intro) { - console.warn(`${objectID} has no body and no intro.`) - } - - const content = - intro && !body.includes(intro.trim()) ? 
`${intro.trim()}\n${body.trim()}`.trim() : body.trim() - - return { - objectID, - breadcrumbs, - title, - headings, - content, - intro, - toplevel, - } -} diff --git a/src/search/scripts/scrape/lib/scrape-into-index-json.ts b/src/search/scripts/scrape/lib/scrape-into-index-json.ts index 1c764d74a563..b7d50b21423e 100644 --- a/src/search/scripts/scrape/lib/scrape-into-index-json.ts +++ b/src/search/scripts/scrape/lib/scrape-into-index-json.ts @@ -1,7 +1,7 @@ import chalk from 'chalk' import languages from '@/languages/lib/languages-server' -import buildRecords from '@/search/scripts/scrape/lib/build-records' +import buildRecords from '@/search/scripts/scrape/lib/build-records-from-api' import findIndexablePages from '@/search/scripts/scrape/lib/find-indexable-pages' import { writeIndexRecords } from '@/search/scripts/scrape/lib/search-index-records' import { getElasticSearchIndex } from '@/search/lib/elasticsearch-indexes' diff --git a/src/search/scripts/scrape/types.ts b/src/search/scripts/scrape/types.ts index 20db4d78b968..96169a695dbd 100644 --- a/src/search/scripts/scrape/types.ts +++ b/src/search/scripts/scrape/types.ts @@ -68,3 +68,10 @@ export interface Redirects { export interface PopularPages { [key: string]: number } + +export interface FailedPage { + url?: string + relativePath?: string + error: string + errorType: string +} diff --git a/src/search/tests/build-records-from-api.ts b/src/search/tests/build-records-from-api.ts new file mode 100644 index 000000000000..b8b333f327df --- /dev/null +++ b/src/search/tests/build-records-from-api.ts @@ -0,0 +1,641 @@ +import { describe, expect, test, vi, beforeEach, afterEach } from 'vitest' + +import { + extractHeadingsFromMarkdown, + extractFromMarkdown, + markdownToPlainText, + articleApiResponseToRecord, + fetchArticleAsRecord, + type ArticleApiResponse, +} from '@/search/scripts/scrape/lib/build-records-from-api' + +import { fetchWithRetry } from '@/frame/lib/fetch-utils' + +vi.mock('@/frame/lib/fetch-utils', () => ({ + fetchWithRetry: vi.fn(), +})) + +const mockFetchWithRetry = vi.mocked(fetchWithRetry) + +describe('extractHeadingsFromMarkdown', () => { + test('extracts h2 headings', () => { + const markdown = `# Title + +Some intro text. + +## First Section + +Content here. + +## Second Section + +More content. + +### Subsection + +This should be ignored (h3). +` + const headings = extractHeadingsFromMarkdown(markdown) + expect(headings).toBe('First Section\nSecond Section') + }) + + test('filters out navigational headings', () => { + const markdown = `# Title + +## In this article + +Navigation links. + +## Main Content + +The actual content. + +## Further reading + +More links. + +## Prerequisites + +Setup steps. 
+` + const headings = extractHeadingsFromMarkdown(markdown) + expect(headings).toBe('Main Content') + }) + + test('handles markdown formatting in headings', () => { + const markdown = `# Title + +## Using \`code\` in headings + +## A [link](https://example.com) heading + +## **Bold** heading +` + const headings = extractHeadingsFromMarkdown(markdown) + // Verify complete heading text with formatting stripped + expect(headings).toContain('Using code in headings') + expect(headings).toContain('A link heading') + expect(headings).toContain('Bold heading') + // Should not contain markdown syntax + expect(headings).not.toContain('`') + expect(headings).not.toContain('**') + expect(headings).not.toContain('](') + }) + + test('returns empty string for no h2 headings', () => { + const markdown = `# Just a title + +Some content without sections. +` + const headings = extractHeadingsFromMarkdown(markdown) + expect(headings).toBe('') + }) + + test('filters out Japanese navigational headings', () => { + const markdown = `# タイトル + +## この記事の内容 + +ナビゲーション。 + +## メインコンテンツ + +実際の内容。 + +## 参考資料 + +リンク。 + +## 前提条件 + +セットアップ。 +` + const headings = extractHeadingsFromMarkdown(markdown) + expect(headings).toBe('メインコンテンツ') + }) + + test('filters out non-English navigational headings across languages', () => { + // Chinese + expect(extractHeadingsFromMarkdown('## 本文内容\n\n## 实际内容')).toBe('实际内容') + expect(extractHeadingsFromMarkdown('## 延伸阅读\n\n## 实际内容')).toBe('实际内容') + + // Korean + expect(extractHeadingsFromMarkdown('## 이 문서의 내용\n\n## 실제 내용')).toBe('실제 내용') + expect(extractHeadingsFromMarkdown('## 추가 참고 자료\n\n## 실제 내용')).toBe('실제 내용') + + // Spanish + expect(extractHeadingsFromMarkdown('## En este artículo\n\n## Contenido real')).toBe( + 'Contenido real', + ) + expect(extractHeadingsFromMarkdown('## Información adicional\n\n## Contenido real')).toBe( + 'Contenido real', + ) + + // French + expect(extractHeadingsFromMarkdown('## Dans cet article\n\n## Contenu réel')).toBe( + 'Contenu réel', + ) + expect(extractHeadingsFromMarkdown('## Prérequis\n\n## Contenu réel')).toBe('Contenu réel') + + // German + expect(extractHeadingsFromMarkdown('## Voraussetzungen\n\n## Echter Inhalt')).toBe( + 'Echter Inhalt', + ) + }) +}) + +describe('markdownToPlainText', () => { + test('converts markdown to plain text', () => { + const markdown = `# Title + +This is **bold** and *italic* text. + +- List item 1 +- List item 2 + +[A link](https://example.com) +` + const text = markdownToPlainText(markdown) + expect(text).toContain('Title') + expect(text).toContain('bold') + expect(text).toContain('italic') + expect(text).toContain('List item 1') + expect(text).toContain('A link') + // Should not contain markdown syntax + expect(text).not.toContain('**') + expect(text).not.toContain('](') + }) + + test('includes fenced code block content', () => { + const markdown = `Some text. + +\`\`\`javascript +const x = 1; +\`\`\` + +More text. +` + const text = markdownToPlainText(markdown) + expect(text).toContain('Some text') + expect(text).toContain('More text') + expect(text).toContain('const x = 1') + }) + + test('preserves inline code', () => { + const markdown = 'Use the `git commit` command to save changes.' + const text = markdownToPlainText(markdown) + expect(text).toContain('git commit') + expect(text).toContain('Use the') + }) + + test('inserts whitespace between list items', () => { + const markdown = `1. First item ends with SSH. + +2. Make a request using the CLI. +` + const text = markdownToPlainText(markdown) + // "SSH." 
and "Make" must not merge into "SSH.Make" + expect(text).not.toMatch(/SSH\.Make/) + expect(text).toMatch(/SSH\.\n/) + expect(text).toContain('Make a request') + }) + + test('inserts whitespace between nested block elements', () => { + const markdown = `> First paragraph in blockquote. +> +> Second paragraph in blockquote. +` + const text = markdownToPlainText(markdown) + // Paragraphs within a blockquote should be separated + expect(text).not.toMatch(/blockquote\.Second/) + expect(text).toContain('First paragraph in blockquote.') + expect(text).toContain('Second paragraph in blockquote.') + }) + + test('handles GFM tables cleanly', () => { + const markdown = `Some intro. + +| Column A | Column B | +| --- | --- | +| Cell 1 | Cell 2 | +| Cell 3 | Cell 4 | + +More text. +` + const text = markdownToPlainText(markdown) + expect(text).toContain('Column A') + expect(text).toContain('Cell 1') + expect(text).toContain('More text') + // Should not contain raw GFM table syntax artifacts + expect(text).not.toContain('| ---') + expect(text).not.toContain('---') + }) +}) + +describe('extractFromMarkdown', () => { + test('returns both headings and content in a single pass', () => { + const markdown = `# Title + +## Section One + +Some content. + +## Further reading + +Links here. + +\`\`\`json +{ "key": "value" } +\`\`\` + +## Section Two + +More content. +` + const result = extractFromMarkdown(markdown) + + // Headings should exclude "Further reading" + expect(result.headings).toBe('Section One\nSection Two') + + // Content should include fenced code block text + expect(result.content).toContain('Some content') + expect(result.content).toContain('More content') + expect(result.content).toContain('"key"') + }) + + test('produces same results as separate wrapper calls', () => { + const markdown = `# Test + +## Heading One + +Body text here. + +## Prerequisites + +Setup info. +` + const combined = extractFromMarkdown(markdown) + const headingsOnly = extractHeadingsFromMarkdown(markdown) + const contentOnly = markdownToPlainText(markdown) + + expect(combined.headings).toBe(headingsOnly) + expect(combined.content).toBe(contentOnly) + }) +}) + +describe('articleApiResponseToRecord', () => { + test('converts API response to search record', () => { + const response: ArticleApiResponse = { + meta: { + title: 'About GitHub', + intro: 'Learn about GitHub.', + product: 'Get started', + breadcrumbs: [ + { href: '/en/get-started', title: 'Get started' }, + { href: '/en/get-started/overview', title: 'Overview' }, + { href: '/en/get-started/overview/about-github', title: 'About GitHub' }, + ], + }, + body: `# About GitHub + +Learn about GitHub. + +## What is GitHub? + +GitHub is a platform for collaboration. + +## Getting started + +Here's how to begin. 
+`, + } + + const record = articleApiResponseToRecord('/en/get-started/overview/about-github', response) + + expect(record.objectID).toBe('/en/get-started/overview/about-github') + expect(record.title).toBe('About GitHub') + expect(record.intro).toBe('Learn about GitHub.') + expect(record.breadcrumbs).toBe('Get started / Overview') + expect(record.toplevel).toBe('Get started') + expect(record.headings).toBe('What is GitHub?\nGetting started') + expect(record.content).toContain('GitHub is a platform') + }) + + test('handles missing breadcrumbs (archived pages)', () => { + const response: ArticleApiResponse = { + meta: { + title: 'Archived Page', + intro: 'This is archived.', + product: 'Old product', + // No breadcrumbs - simulating archived page + }, + body: '# Archived Page\n\nContent here.', + } + + const record = articleApiResponseToRecord('/en/archived/page', response) + + expect(record.breadcrumbs).toBe('') + expect(record.toplevel).toBe('') + expect(record.title).toBe('Archived Page') + }) + + test('handles single breadcrumb (product landing page)', () => { + const response: ArticleApiResponse = { + meta: { + title: 'Get started', + intro: 'Welcome to GitHub.', + product: 'Get started', + breadcrumbs: [{ href: '/en/get-started', title: 'Get started' }], + }, + body: '# Get started\n\nWelcome.', + } + + const record = articleApiResponseToRecord('/en/get-started', response) + + // For single breadcrumb, don't slice it off + expect(record.breadcrumbs).toBe('Get started') + expect(record.toplevel).toBe('Get started') + }) + + test('prepends intro to content if not already present', () => { + const response: ArticleApiResponse = { + meta: { + title: 'Test', + intro: 'Unique intro text.', + product: 'Test', + breadcrumbs: [], + }, + body: '# Test\n\nDifferent body content.', + } + + const record = articleApiResponseToRecord('/en/test', response) + + expect(record.content).toMatch(/^Unique intro text\./) + expect(record.content).toContain('Different body content') + }) + + test('does not duplicate intro if already in body', () => { + const response: ArticleApiResponse = { + meta: { + title: 'Test', + intro: 'Same intro.', + product: 'Test', + breadcrumbs: [], + }, + body: '# Test\n\nSame intro.\n\nMore content.', + } + + const record = articleApiResponseToRecord('/en/test', response) + + // Intro should appear only once + const introCount = (record.content.match(/Same intro/g) || []).length + expect(introCount).toBe(1) + }) + + test('includes fenced code block content for search', () => { + const response: ArticleApiResponse = { + meta: { + title: 'REST API', + intro: 'API reference.', + product: 'REST', + breadcrumbs: [], + }, + body: `# REST API + +## Endpoints + +Use the endpoint below. + +\`\`\`json +{ + "ssh_url": "ssh://git@github.com/owner/repo.git", + "properties": { "name": "string" } +} +\`\`\` + +## Parameters + +The \`name\` parameter is required. 
+`,
+    }
+
+    const record = articleApiResponseToRecord('/en/rest/api', response)
+
+    expect(record.content).toContain('Use the endpoint below')
+    expect(record.content).toContain('parameter is required')
+    // Fenced code block content should be included for search
+    expect(record.content).toContain('ssh_url')
+    expect(record.content).toContain('ssh://git@github.com')
+    // Inline code content should also be preserved
+    expect(record.content).toContain('name')
+  })
+})
+
+describe('fetchArticleAsRecord', () => {
+  beforeEach(() => {
+    vi.restoreAllMocks()
+  })
+
+  afterEach(() => {
+    vi.restoreAllMocks()
+  })
+
+  test('returns record on successful API response', async () => {
+    const mockResponse: ArticleApiResponse = {
+      meta: {
+        title: 'Test Article',
+        intro: 'Test intro.',
+        product: 'Test Product',
+        breadcrumbs: [{ href: '/en/test', title: 'Test' }],
+      },
+      body: '# Test Article\n\nTest content.',
+    }
+
+    mockFetchWithRetry.mockResolvedValue({
+      ok: true,
+      json: () => Promise.resolve(mockResponse),
+    } as Response)
+
+    const result = await fetchArticleAsRecord('/en/test/article', 'http://localhost:4002')
+
+    expect(result.record).not.toBeNull()
+    expect(result.failure).toBeNull()
+    expect(result.record?.title).toBe('Test Article')
+    expect(result.record?.objectID).toBe('/en/test/article')
+  })
+
+  test('calls fetchWithRetry with correct options', async () => {
+    const mockResponse: ArticleApiResponse = {
+      meta: {
+        title: 'Test',
+        intro: 'Intro',
+        product: 'Product',
+        breadcrumbs: [],
+      },
+      body: '# Test',
+    }
+
+    mockFetchWithRetry.mockResolvedValue({
+      ok: true,
+      json: () => Promise.resolve(mockResponse),
+    } as Response)
+
+    await fetchArticleAsRecord('/en/test', 'http://localhost:4002')
+
+    expect(mockFetchWithRetry).toHaveBeenCalledWith(
+      'http://localhost:4002/api/article?pathname=%2Fen%2Ftest',
+      undefined,
+      {
+        retries: 3,
+        throwHttpErrors: false,
+        timeout: 60000,
+      },
+    )
+  })
+
+  test('returns failure for HTTP 404', async () => {
+    mockFetchWithRetry.mockResolvedValue({
+      ok: false,
+      status: 404,
+      statusText: 'Not Found',
+      json: () => Promise.reject(new Error('no body')),
+    } as unknown as Response)
+
+    const result = await fetchArticleAsRecord('/en/nonexistent', 'http://localhost:4002')
+
+    expect(result.record).toBeNull()
+    expect(result.failure).not.toBeNull()
+    expect(result.failure?.errorType).toBe('HTTP 404')
+    expect(result.failure?.error).toContain('404')
+  })
+
+  test('returns failure for HTTP 500', async () => {
+    mockFetchWithRetry.mockResolvedValue({
+      ok: false,
+      status: 500,
+      statusText: 'Internal Server Error',
+      json: () => Promise.reject(new Error('no body')),
+    } as unknown as Response)
+
+    const result = await fetchArticleAsRecord('/en/broken', 'http://localhost:4002')
+
+    expect(result.record).toBeNull()
+    expect(result.failure).not.toBeNull()
+    expect(result.failure?.errorType).toBe('HTTP 500')
+  })
+
+  test('parses 403 response body for error message', async () => {
+    mockFetchWithRetry.mockResolvedValue({
+      ok: false,
+      status: 403,
+      statusText: 'Forbidden',
+      json: () => Promise.resolve({ error: 'Page is archived and not available' }),
+    } as unknown as Response)
+
+    const result = await fetchArticleAsRecord('/en/archived/page', 'http://localhost:4002')
+
+    expect(result.record).toBeNull()
+    expect(result.failure).not.toBeNull()
+    expect(result.failure?.errorType).toBe('API Error')
+    expect(result.failure?.error).toBe('Page is archived and not available')
+  })
+
+  test('falls back to HTTP status when 403 body is not JSON', async () => {
+    mockFetchWithRetry.mockResolvedValue({
+      ok: false,
+      status: 403,
+      statusText: 'Forbidden',
+      json: () => Promise.reject(new Error('Invalid JSON')),
+    } as unknown as Response)
+
+    const result = await fetchArticleAsRecord('/en/forbidden', 'http://localhost:4002')
+
+    expect(result.record).toBeNull()
+    expect(result.failure).not.toBeNull()
+    expect(result.failure?.errorType).toBe('HTTP 403')
+    expect(result.failure?.error).toBe('HTTP 403: Forbidden')
+  })
+
+  test('returns failure for API error response (archived pages)', async () => {
+    mockFetchWithRetry.mockResolvedValue({
+      ok: true,
+      json: () => Promise.resolve({ error: 'This page is archived' }),
+    } as Response)
+
+    const result = await fetchArticleAsRecord('/en/archived/page', 'http://localhost:4002')
+
+    expect(result.record).toBeNull()
+    expect(result.failure).not.toBeNull()
+    expect(result.failure?.errorType).toBe('API Error')
+    expect(result.failure?.error).toBe('This page is archived')
+  })
+
+  test('returns failure with Timeout errorType for AbortError', async () => {
+    const abortError = new Error('The operation was aborted')
+    abortError.name = 'AbortError'
+
+    mockFetchWithRetry.mockRejectedValue(abortError)
+
+    const result = await fetchArticleAsRecord('/en/slow/page', 'http://localhost:4002')
+
+    expect(result.record).toBeNull()
+    expect(result.failure).not.toBeNull()
+    expect(result.failure?.errorType).toBe('Timeout')
+  })
+
+  test('returns failure with Timeout errorType for ETIMEDOUT', async () => {
+    const timeoutError = new Error('Connection timed out') as Error & { code: string }
+    timeoutError.code = 'ETIMEDOUT'
+
+    mockFetchWithRetry.mockRejectedValue(timeoutError)
+
+    const result = await fetchArticleAsRecord('/en/slow/page', 'http://localhost:4002')
+
+    expect(result.record).toBeNull()
+    expect(result.failure).not.toBeNull()
+    expect(result.failure?.errorType).toBe('Timeout')
+  })
+
+  test('returns failure with Network Error for other errors', async () => {
+    mockFetchWithRetry.mockRejectedValue(new Error('Connection refused'))
+
+    const result = await fetchArticleAsRecord('/en/unreachable', 'http://localhost:4002')
+
+    expect(result.record).toBeNull()
+    expect(result.failure).not.toBeNull()
+    expect(result.failure?.errorType).toBe('Network Error')
+    expect(result.failure?.error).toBe('Connection refused')
+  })
+
+  test('FetchResult structure matches expected shape', async () => {
+    const mockResponse: ArticleApiResponse = {
+      meta: {
+        title: 'Test',
+        intro: 'Intro',
+        product: 'Product',
+        breadcrumbs: [],
+      },
+      body: '# Test',
+    }
+
+    mockFetchWithRetry.mockResolvedValue({
+      ok: true,
+      json: () => Promise.resolve(mockResponse),
+    } as Response)
+
+    const result = await fetchArticleAsRecord('/en/test', 'http://localhost:4002')
+
+    // Verify the shape of FetchResult
+    expect(result).toHaveProperty('record')
+    expect(result).toHaveProperty('failure')
+
+    // When successful, record should have all expected fields
+    expect(result.record).toHaveProperty('objectID')
+    expect(result.record).toHaveProperty('title')
+    expect(result.record).toHaveProperty('intro')
+    expect(result.record).toHaveProperty('content')
+    expect(result.record).toHaveProperty('headings')
+    expect(result.record).toHaveProperty('breadcrumbs')
+    expect(result.record).toHaveProperty('toplevel')
+  })
+})
diff --git a/src/search/tests/parse-page-sections-into-records.ts b/src/search/tests/parse-page-sections-into-records.ts
deleted file mode 100644
index 0e2e0741fc9d..000000000000
--- a/src/search/tests/parse-page-sections-into-records.ts
+++ /dev/null
@@ -1,131 +0,0 @@
-import { fileURLToPath } from 'url'
-import path from 'path'
-import fs from 'fs/promises'
-
-import cheerio from 'cheerio'
-import { describe, expect, test } from 'vitest'
-
-import parsePageSectionsIntoRecords from '@/search/scripts/scrape/lib/parse-page-sections-into-records'
-import type { Record } from '@/search/scripts/scrape/types'
-
-const __dirname = path.dirname(fileURLToPath(import.meta.url))
-
-// Define the shape of fixtures with explicit keys and string values
-const fixtures: {
-  pageWithSections: string
-  pageWithoutSections: string
-  pageWithoutBody: string
-  pageMultipleH1s: string
-  pageHeadingParagraphNoWhitespace: string
-} = {
-  pageWithSections: await fs.readFile(
-    path.join(__dirname, 'fixtures/page-with-sections.html'),
-    'utf8',
-  ),
-  pageWithoutSections: await fs.readFile(
-    path.join(__dirname, 'fixtures/page-without-sections.html'),
-    'utf8',
-  ),
-  pageWithoutBody: await fs.readFile(
-    path.join(__dirname, 'fixtures/page-without-body.html'),
-    'utf8',
-  ),
-  pageMultipleH1s: await fs.readFile(
-    path.join(__dirname, 'fixtures/page-with-multiple-h1s.html'),
-    'utf8',
-  ),
-  pageHeadingParagraphNoWhitespace: await fs.readFile(
-    path.join(__dirname, 'fixtures/page-with-heading-and-paragraph-no-whitespace.html'),
-    'utf8',
-  ),
-}
-
-describe('search parsePageSectionsIntoRecords module', () => {
-  test('works for pages with sections', () => {
-    const html: string = fixtures.pageWithSections
-    const $ = cheerio.load(html)
-    const href: string = '/example/href'
-    const record: Record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' })
-    const expected: Record = {
-      objectID: '/example/href',
-      breadcrumbs: 'GitHub Actions / actions learning path',
-      title: 'I am the page title',
-      headings: 'First heading\nSecond heading\nTable heading',
-      content:
-        'This is an introduction to the article.\n' +
-        "In this article\nThis won't be ignored.\nFirst heading\n" +
-        "Here's a paragraph.\nAnd another.\nSecond heading\n" +
-        "Here's a paragraph in the second section.\nAnd another.\n" +
-        'Table heading\nPeter\nHuman\n' +
-        'Bullet\nPoint\nNumbered\nList\n' +
-        "Further reading\nThis won't be ignored.",
-      intro: 'This is an introduction to the article.',
-      toplevel: 'GitHub Actions',
-    }
-
-    expect(record).toEqual(expected)
-  })
-
-  test('works for pages without sections', () => {
-    const html: string = fixtures.pageWithoutSections
-    const $ = cheerio.load(html)
-    const href: string = '/example/href'
-    const record: Record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' })
-    const expected: Record = {
-      objectID: '/example/href',
-      breadcrumbs: 'Education / subcategory',
-      title: 'A page without sections',
-      headings: '',
-      content: 'This is an introduction to the article.\nFirst paragraph.\nSecond paragraph.',
-      intro: 'This is an introduction to the article.',
-      toplevel: 'Education',
-    }
-
-    expect(record).toEqual(expected)
-  })
-
-  test('works for pages without content', () => {
-    const html: string = fixtures.pageWithoutBody
-    const $ = cheerio.load(html)
-    const href: string = '/example/href'
-    const record: Record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' })
-    const expected: Record = {
-      objectID: '/example/href',
-      breadcrumbs: 'Education / subcategory',
-      title: 'A page without body',
-      headings: '',
-      content: 'This is an introduction to the article.',
-      intro: 'This is an introduction to the article.',
-      toplevel: 'Education',
-    }
-
-    expect(record).toEqual(expected)
-  })
-
-  test('only picks up the first h1 for the title', () => {
-    const html: string = fixtures.pageMultipleH1s
-    const $ = cheerio.load(html)
-    const href: string = '/example/href'
-    const record: Record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' })
-
-    expect(record.title).toEqual('I am the page title')
-  })
-
-  test("content doesn't lump headings with paragraphs together", () => {
-    const html: string = fixtures.pageHeadingParagraphNoWhitespace
-    const $ = cheerio.load(html)
-    const href: string = '/example/href'
-    const record: Record = parsePageSectionsIntoRecords({ href, $, languageCode: 'en' })
-
-    // Ensure the heading appears only once
-    const headingMatches = record.content.match(/Changing your primary email address/g)
-    expect(headingMatches).not.toBeNull()
-    expect(headingMatches!.length).toBe(1)
-
-    // Ensure there's no concatenation without whitespace
-    expect(record.content.includes('email addressYou can set')).toBeFalsy()
-
-    // Ensure inline elements remain intact
-    expect(record.content).toMatch(/Paragraph\./)
-})
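The diff above only shows the tests, not the helper they exercise. As a point of reference, a GFM-aware `markdownToPlainText` that would satisfy the whitespace and table expectations could be built on an mdast pipeline; the sketch below is illustrative only, assuming `mdast-util-from-markdown` plus the GFM extensions, and is not the actual module added by this change.

```ts
// Illustrative sketch only: assumes an mdast-based pipeline with GFM support.
// The real helper exercised by the tests above may be structured differently.
import { fromMarkdown } from 'mdast-util-from-markdown'
import { gfmFromMarkdown } from 'mdast-util-gfm'
import { gfm } from 'micromark-extension-gfm'
import { toString } from 'mdast-util-to-string'
import { visit } from 'unist-util-visit'
import type { Code, Heading, Paragraph, TableCell } from 'mdast'

export function markdownToPlainText(markdown: string): string {
  const tree = fromMarkdown(markdown, {
    extensions: [gfm()],
    mdastExtensions: [gfmFromMarkdown()],
  })

  const chunks: string[] = []
  // One chunk per block-level node (and per table cell): adjacent blocks are
  // joined with '\n' instead of fusing together, and the GFM delimiter row
  // ("| --- |") never appears because it has no node in the tree.
  visit(tree, ['paragraph', 'heading', 'code', 'tableCell'], (node) => {
    const block = node as Paragraph | Heading | Code | TableCell
    chunks.push(block.type === 'code' ? block.value : toString(block))
  })

  return chunks.filter(Boolean).join('\n')
}
```

Collecting text per block node is what keeps "SSH." and a following "Make a request" heading on separate lines, keeps blockquote paragraphs separated, and carries fenced code block text through to the searchable content.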