-
Notifications
You must be signed in to change notification settings - Fork 202
fix(seo): reduce sitemap bloat by filtering versioned docs and low-value pages #2016
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -15,6 +15,53 @@ const sitemapXMLs = [ | |||||
| ], | ||||||
| ]; | ||||||
|
|
||||||
/**
 * Regular expressions matching URLs that must not appear in the sitemap.
 *
 * Rationale:
 * - Versioned doc URLs (e.g. /docs/apisix/3.14/) duplicate the latest
 *   unversioned paths (e.g. /docs/apisix/) and bloat the sitemap;
 *   only the unversioned (latest) URLs should be indexed.
 * - /docs/.../next/ pages hold unreleased development docs.
 * - /search pages are blocked by robots.txt, so listing them in the
 *   sitemap would send contradictory signals to crawlers.
 * - /blog/tags/ and /blog/page/ are low-value aggregation/pagination
 *   pages, also blocked by robots.txt.
 */
const excludePatterns = [
  // Versioned docs: /docs/<project>/<version>/ where version is digits.digits
  /\/docs\/[\w-]+\/\d+\.\d+\//,
  // Development "next" docs
  /\/docs\/[\w-]+\/next\//,
  // Search pages (blocked by robots.txt)
  /\/search\/?$/,
  // Blog tag and pagination pages (blocked by robots.txt)
  /\/blog\/tags\//,
  /\/blog\/page\//,
];

/**
 * Decide whether a URL belongs in the sitemap.
 *
 * @param {string} url - URL string taken from a sitemap <loc> entry.
 * @returns {boolean} true when the URL matches any exclusion pattern
 *   and should therefore be dropped from the sitemap.
 */
function shouldExclude(url) {
  for (const pattern of excludePatterns) {
    if (pattern.test(url)) {
      return true;
    }
  }
  return false;
}
|
|
||||||
| /** | ||||||
| * Filter out excluded URLs from a sitemap object and return removal count. | ||||||
| */ | ||||||
| function filterSitemapUrls(sitemap) { | ||||||
| const urls = Array.isArray(sitemap.urlset.url) | ||||||
| ? sitemap.urlset.url | ||||||
| : [sitemap.urlset.url]; | ||||||
| const before = urls.length; | ||||||
| sitemap.urlset.url = urls.filter((entry) => { | ||||||
| const loc = entry.loc && entry.loc._text; | ||||||
| return !loc || !shouldExclude(loc); | ||||||
|
||||||
| return !loc || !shouldExclude(loc); | |
| return Boolean(loc) && !shouldExclude(loc); |
Uh oh!
There was an error while loading. Please reload this page.