Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .husky/pre-commit
Original file line number Diff line number Diff line change
@@ -1 +1 @@
yarn format
yarn lint-staged
6 changes: 3 additions & 3 deletions docs/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -261,11 +261,11 @@ __metadata:
linkType: hard

"minimatch@npm:^3.1.1":
version: 3.1.2
resolution: "minimatch@npm:3.1.2"
version: 3.1.3
resolution: "minimatch@npm:3.1.3"
dependencies:
brace-expansion: "npm:^1.1.7"
checksum: 10c0/0262810a8fc2e72cca45d6fd86bd349eee435eb95ac6aa45c9ea2180e7ee875ef44c32b55b5973ceabe95ea12682f6e3725cbb63d7a2d1da3ae1163c8b210311
checksum: 10c0/c1ffce4be47e88df013f66f55176c25a93fdd8ad15735309cf1782f0433a02f363cee298f8763ceaaaf85e70ff7f30dc84a1a8d00a6fb6ca72032e5b51f9b89c
languageName: node
linkType: hard

Expand Down
3 changes: 3 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,9 @@
"typescript-eslint": "^8.28.0",
"vitest": "^4.0.16"
},
"lint-staged": {
"**/*.{js,ts,mjs,mts,cjs,cts,json,css}": "biome format --write --no-errors-on-unmatched"
},
"packageManager": "yarn@4.10.3",
"volta": {
"node": "24.13.0",
Expand Down
56 changes: 47 additions & 9 deletions packages/utils/src/internals/sitemap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,8 @@ export async function* discoverValidSitemaps(
const { proxyUrl } = options;
const { gotScraping } = await import('got-scraping');
const sitemapUrls = new Set<string>();
// Keep each probe bounded so discovery cannot stall indefinitely on a single request.
const DISCOVERY_REQUEST_TIMEOUT_MILLIS = 20_000;

const addSitemapUrl = (url: string): string | undefined => {
const sizeBefore = sitemapUrls.size;
Expand All @@ -472,21 +474,49 @@ export async function* discoverValidSitemaps(
return undefined;
};

const urlExists = (url: string) =>
gotScraping({
proxyUrl,
/**
 * Awaits `promise`, but gives up once `timeoutMillis` milliseconds have elapsed.
 *
 * Whichever settles first wins: the result (or rejection) of `promise`, or a
 * rejection with `new Error(timeoutMessage)` when the timer fires. The timer is
 * always cleared once the race settles, so it never keeps the process alive.
 * The underlying operation is not cancelled on timeout; only its result is ignored.
 *
 * @param promise The operation to bound in time.
 * @param timeoutMillis Maximum time to wait, in milliseconds.
 * @param timeoutMessage Message of the error raised when the time limit is hit.
 * @returns The value `promise` resolves with, if it settles before the deadline.
 */
const runWithTimeout = async <T>(
    promise: Promise<T>,
    timeoutMillis: number,
    timeoutMessage: string,
): Promise<T> => {
    let timerHandle: ReturnType<typeof setTimeout> | undefined;

    // A promise that can only ever reject, and only when the deadline passes.
    const deadline = new Promise<never>((_resolve, rejectOnExpiry) => {
        timerHandle = setTimeout(() => {
            rejectOnExpiry(new Error(timeoutMessage));
        }, timeoutMillis);
    });

    // Release the timer regardless of which side of the race settled first.
    return Promise.race([promise, deadline]).finally(() => {
        if (timerHandle !== undefined) {
            clearTimeout(timerHandle);
        }
    });
};

const urlExists = async (url: string) => {
const response = await gotScraping({
url,
method: 'HEAD',
}).then((response) => response.statusCode >= 200 && response.statusCode < 400);
proxyUrl,
timeout: {
request: DISCOVERY_REQUEST_TIMEOUT_MILLIS,
},
});

return response.statusCode >= 200 && response.statusCode < 400;
};

const discoverSitemapsForDomainUrls = async function* (hostname: string, domainUrls: string[]) {
if (!hostname) {
return;
}

try {
const robotsFile = await RobotsFile.find(domainUrls[0], proxyUrl);

const robotsFile = await runWithTimeout(
RobotsFile.find(domainUrls[0], proxyUrl),
DISCOVERY_REQUEST_TIMEOUT_MILLIS,
`Fetching robots.txt timed out for ${hostname}`,
);
for (const sitemapUrl of robotsFile.getSitemaps()) {
if (addSitemapUrl(sitemapUrl)) {
yield sitemapUrl;
Expand All @@ -507,10 +537,18 @@ export async function* discoverValidSitemaps(
const possibleSitemapPathnames = ['/sitemap.xml', '/sitemap.txt', '/sitemap_index.xml'];
for (const pathname of possibleSitemapPathnames) {
firstUrl.pathname = pathname;
if (await urlExists(firstUrl.toString())) {
if (addSitemapUrl(firstUrl.toString())) {
yield firstUrl.toString();
const candidateSitemapUrl = firstUrl.toString();

try {
if (await urlExists(candidateSitemapUrl)) {
if (addSitemapUrl(candidateSitemapUrl)) {
yield candidateSitemapUrl;
}
}
} catch (err) {
log.debug(`Failed to check sitemap candidate ${candidateSitemapUrl} for ${hostname}`, {
error: err,
});
}
}
}
Expand Down
24 changes: 12 additions & 12 deletions website/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -5897,14 +5897,14 @@ __metadata:
linkType: hard

"ajv@npm:^6.12.4, ajv@npm:^6.12.5":
version: 6.12.6
resolution: "ajv@npm:6.12.6"
version: 6.14.0
resolution: "ajv@npm:6.14.0"
dependencies:
fast-deep-equal: "npm:^3.1.1"
fast-json-stable-stringify: "npm:^2.0.0"
json-schema-traverse: "npm:^0.4.1"
uri-js: "npm:^4.2.2"
checksum: 10c0/41e23642cbe545889245b9d2a45854ebba51cda6c778ebced9649420d9205f2efb39cb43dbc41e358409223b1ea43303ae4839db682c848b891e4811da1a5a71
checksum: 10c0/a2bc39b0555dc9802c899f86990eb8eed6e366cddbf65be43d5aa7e4f3c4e1a199d5460fd7ca4fb3d864000dbbc049253b72faa83b3b30e641ca52cb29a68c22
languageName: node
linkType: hard

Expand Down Expand Up @@ -6405,9 +6405,9 @@ __metadata:
linkType: hard

"bn.js@npm:^4.0.0, bn.js@npm:^4.1.0, bn.js@npm:^4.11.9":
version: 4.12.2
resolution: "bn.js@npm:4.12.2"
checksum: 10c0/09a249faa416a9a1ce68b5f5ec8bbca87fe54e5dd4ef8b1cc8a4969147b80035592bddcb1e9cc814c3ba79e573503d5c5178664b722b509fb36d93620dba9b57
version: 4.12.3
resolution: "bn.js@npm:4.12.3"
checksum: 10c0/53b6a4db8a583abd2522eacd480fece26fe6c4d8d35d03e5e11e15cb0873a3044eb4e3d1f9fef56f47eb008219e99ba5b620c26f57db49a687c6ab2cf848d50b
languageName: node
linkType: hard

Expand Down Expand Up @@ -12183,8 +12183,8 @@ __metadata:
linkType: hard

"markdown-it@npm:^14.1.0":
version: 14.1.0
resolution: "markdown-it@npm:14.1.0"
version: 14.1.1
resolution: "markdown-it@npm:14.1.1"
dependencies:
argparse: "npm:^2.0.1"
entities: "npm:^4.4.0"
Expand All @@ -12194,7 +12194,7 @@ __metadata:
uc.micro: "npm:^2.1.0"
bin:
markdown-it: bin/markdown-it.mjs
checksum: 10c0/9a6bb444181d2db7016a4173ae56a95a62c84d4cbfb6916a399b11d3e6581bf1cc2e4e1d07a2f022ae72c25f56db90fbe1e529fca16fbf9541659dc53480d4b4
checksum: 10c0/c67f2a4c8069a307c78d8c15104bbcb15a2c6b17f4c904364ca218ec2eccf76a397eba1ea05f5ac5de72c4b67fcf115d422d22df0bfb86a09b663f55b9478d4f
languageName: node
linkType: hard

Expand Down Expand Up @@ -17117,15 +17117,15 @@ __metadata:
linkType: hard

"tar@npm:^7.5.2":
version: 7.5.7
resolution: "tar@npm:7.5.7"
version: 7.5.9
resolution: "tar@npm:7.5.9"
dependencies:
"@isaacs/fs-minipass": "npm:^4.0.0"
chownr: "npm:^3.0.0"
minipass: "npm:^7.1.2"
minizlib: "npm:^3.1.0"
yallist: "npm:^5.0.0"
checksum: 10c0/51f261afc437e1112c3e7919478d6176ea83f7f7727864d8c2cce10f0b03a631d1911644a567348c3063c45abdae39718ba97abb073d22aa3538b9a53ae1e31c
checksum: 10c0/e870beb1b2477135ca2abe86b2d18f7b35d0a4e3a37bbc523d3b8f7adca268dfab543f26528a431d569897f8c53a7cac745cdfbc4411c2f89aeeacc652b81b0a
languageName: node
linkType: hard

Expand Down
Loading
Loading