2 changes: 1 addition & 1 deletion package.json
@@ -87,7 +87,7 @@
"apify-node-curl-impersonate": "^1.0.15",
"basic-auth-parser": "^0.0.2",
"body-parser": "^2.0.0",
"camoufox-js": "^0.8.0",
"camoufox-js": "^0.9.0",
"commitlint": "^20.0.0",
"cross-env": "^10.0.0",
"deep-equal": "^2.0.5",
2 changes: 1 addition & 1 deletion packages/basic-crawler/package.json
@@ -53,7 +53,7 @@
"@crawlee/utils": "3.16.0",
"csv-stringify": "^6.2.0",
"fs-extra": "^11.0.0",
"got-scraping": "^4.0.0",
"got-scraping": "^4.2.1",
"ow": "^0.28.1",
"tldts": "^7.0.0",
"tslib": "^2.4.0",
2 changes: 1 addition & 1 deletion packages/core/package.json
@@ -66,7 +66,7 @@
"@vladfrangu/async_event_emitter": "^2.2.2",
"csv-stringify": "^6.2.0",
"fs-extra": "^11.0.0",
"got-scraping": "^4.0.0",
"got-scraping": "^4.2.1",
"json5": "^2.2.3",
"minimatch": "^9.0.0",
"ow": "^0.28.1",
2 changes: 1 addition & 1 deletion packages/http-crawler/package.json
@@ -61,7 +61,7 @@
"@types/content-type": "^1.1.5",
"cheerio": "1.0.0-rc.12",
"content-type": "^1.0.4",
"got-scraping": "^4.0.0",
"got-scraping": "^4.2.1",
"iconv-lite": "^0.7.0",
"mime-types": "^2.1.35",
"ow": "^0.28.1",
2 changes: 1 addition & 1 deletion packages/templates/templates/camoufox-ts/package.json
@@ -4,7 +4,7 @@
"type": "module",
"description": "This is an example of a Crawlee project.",
"dependencies": {
"camoufox-js": "^0.8.0",
"camoufox-js": "^0.9.0",
"crawlee": "^3.0.0",
"playwright": "*"
},
2 changes: 1 addition & 1 deletion packages/utils/package.json
@@ -53,7 +53,7 @@
"@types/sax": "^1.2.7",
"cheerio": "1.0.0-rc.12",
"file-type": "^20.0.0",
"got-scraping": "^4.0.3",
"got-scraping": "^4.2.1",
"ow": "^0.28.1",
"robots-parser": "^3.0.1",
"sax": "^1.4.1",
19 changes: 16 additions & 3 deletions packages/utils/src/internals/robots.ts
@@ -36,13 +36,20 @@ export class RobotsTxtFile {
* Determine the location of a robots.txt file for a URL and fetch it.
* @param url the URL to fetch robots.txt for
* @param [proxyUrl] a proxy to be used for fetching the robots.txt file
* @param [options] additional options
* @param [options.signal] an AbortSignal to cancel the request
* @param [options.timeoutMillis] timeout in milliseconds for the request
*/
static async find(url: string, proxyUrl?: string): Promise<RobotsTxtFile> {
static async find(
url: string,
proxyUrl?: string,
options?: { signal?: AbortSignal; timeoutMillis?: number },
): Promise<RobotsTxtFile> {
const robotsTxtFileUrl = new URL(url);
robotsTxtFileUrl.pathname = '/robots.txt';
robotsTxtFileUrl.search = '';

return RobotsTxtFile.load(robotsTxtFileUrl.toString(), proxyUrl);
return RobotsTxtFile.load(robotsTxtFileUrl.toString(), proxyUrl, options);
}

/**
@@ -55,7 +62,11 @@ export class RobotsTxtFile {
return new RobotsTxtFile(robotsParser(url, content), proxyUrl);
}

protected static async load(url: string, proxyUrl?: string): Promise<RobotsTxtFile> {
protected static async load(
url: string,
proxyUrl?: string,
options?: { signal?: AbortSignal; timeoutMillis?: number },
): Promise<RobotsTxtFile> {
if (!HTTPError) {
HTTPError = (await import('got-scraping')).HTTPError;
}
@@ -66,6 +77,8 @@
proxyUrl,
method: 'GET',
responseType: 'text',
signal: options?.signal,
...(options?.timeoutMillis ? { timeout: { request: options.timeoutMillis } } : {}),
});

return new RobotsTxtFile(robotsParser(url.toString(), response.body), proxyUrl);
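
For orientation, a minimal usage sketch of the extended `find()` signature introduced above, assuming `RobotsTxtFile` is re-exported from `@crawlee/utils`; the URL, the timeout value, and the abort wiring are placeholders, not part of the diff. The `timeoutMillis` option maps onto got-scraping's `timeout.request` as shown in `load()`.

```ts
import { RobotsTxtFile } from '@crawlee/utils';

// Illustrative: cancel manually (e.g. on crawler shutdown) or let the timeout fire first.
const controller = new AbortController();

const robots = await RobotsTxtFile.find('https://example.com/some/page', undefined, {
    signal: controller.signal, // forwarded to the underlying got-scraping request
    timeoutMillis: 5_000, // becomes got-scraping's `timeout.request`
});

console.log(robots.getSitemaps()); // sitemap URLs declared in robots.txt, as consumed in sitemap.ts
```
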
75 changes: 45 additions & 30 deletions packages/utils/src/internals/sitemap.ts
@@ -454,13 +454,42 @@ export async function* discoverValidSitemaps(
* Proxy URL to be used for network requests.
*/
proxyUrl?: string;
/**
* Timeout in milliseconds for the entire `discoverValidSitemaps` call.
* An `AbortController` is created internally and its signal is passed to every HTTP request,
* so the whole discovery operation is cancelled once the timeout elapses.
* Defaults to `60_000` ms (60 seconds) to prevent indefinite hangs.
*/
timeoutMillis?: number;
/**
* An external `AbortSignal` to cancel the entire discovery operation.
     * If both `signal` and `timeoutMillis` are provided, the operation is cancelled
     * when either the signal is aborted or the timeout elapses (whichever comes first).
*/
signal?: AbortSignal;
/**
* Timeout in milliseconds for each individual HTTP request during discovery.
* Defaults to `20000` ms (20 seconds).
*/
requestTimeoutMillis?: number;
} = {},
): AsyncIterable<string> {
const { proxyUrl } = options;
const { proxyUrl, timeoutMillis = 60_000, signal: externalSignal, requestTimeoutMillis = 20_000 } = options;
const controller = new AbortController();

const timeoutHandle = setTimeout(() => controller.abort(), timeoutMillis);
const onExternalAbort = () => controller.abort();
if (externalSignal) {
if (externalSignal.aborted) {
controller.abort();
} else {
externalSignal.addEventListener('abort', onExternalAbort, { once: true });
}
}

const signal = controller.signal;
const { gotScraping } = await import('got-scraping');
const sitemapUrls = new Set<string>();
// Keep each probe bounded so discovery cannot stall indefinitely on a single request.
const DISCOVERY_REQUEST_TIMEOUT_MILLIS = 20_000;

const addSitemapUrl = (url: string): string | undefined => {
const sizeBefore = sitemapUrls.size;
@@ -474,33 +503,15 @@
return undefined;
};

const runWithTimeout = async <T>(
promise: Promise<T>,
timeoutMillis: number,
timeoutMessage: string,
): Promise<T> => {
let timeout: ReturnType<typeof setTimeout> | undefined;
const timeoutPromise = new Promise<never>((_, reject) => {
timeout = setTimeout(() => reject(new Error(timeoutMessage)), timeoutMillis);
});

try {
return await Promise.race([promise, timeoutPromise]);
} finally {
if (timeout !== undefined) {
clearTimeout(timeout);
}
}
};

const urlExists = async (url: string) => {
const response = await gotScraping({
url,
method: 'HEAD',
proxyUrl,
timeout: {
request: DISCOVERY_REQUEST_TIMEOUT_MILLIS,
request: requestTimeoutMillis,
},
signal,
});

return response.statusCode >= 200 && response.statusCode < 400;
@@ -512,11 +523,10 @@
}

try {
const robotsFile = await runWithTimeout(
RobotsFile.find(domainUrls[0], proxyUrl),
DISCOVERY_REQUEST_TIMEOUT_MILLIS,
`Fetching robots.txt timed out for ${hostname}`,
);
const robotsFile = await RobotsFile.find(domainUrls[0], proxyUrl, {
timeoutMillis: requestTimeoutMillis,
signal,
});
for (const sitemapUrl of robotsFile.getSitemaps()) {
if (addSitemapUrl(sitemapUrl)) {
yield sitemapUrl;
@@ -568,7 +578,12 @@
discoverSitemapsForDomainUrls(hostname, domainUrls),
);

for await (const url of mergeAsyncIterables(...iterables)) {
yield url;
try {
for await (const url of mergeAsyncIterables(...iterables)) {
yield url;
}
} finally {
clearTimeout(timeoutHandle);
externalSignal?.removeEventListener('abort', onExternalAbort);
}
}
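
Similarly, a hedged sketch of the new `discoverValidSitemaps` options, assuming the helper is exported from `@crawlee/utils`; the target URL and the budgets are placeholders, and the comments restate the option docs above.

```ts
import { discoverValidSitemaps } from '@crawlee/utils';

// Illustrative: allow manual cancellation on top of the built-in timeouts.
const controller = new AbortController();

for await (const sitemapUrl of discoverValidSitemaps(['https://example.com'], {
    timeoutMillis: 30_000, // overall budget, enforced via an internal AbortController
    requestTimeoutMillis: 5_000, // per-request cap for the robots.txt fetch and HEAD probes
    signal: controller.signal, // external abort; whichever trigger fires first wins
})) {
    console.log(`Discovered sitemap: ${sitemapUrl}`);
}
```

The diff combines the external signal with the internal timeout through a manual `abort` listener rather than `AbortSignal.any()`, presumably to stay compatible with Node versions that predate that API.
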
42 changes: 42 additions & 0 deletions packages/utils/test/robots.test.ts
@@ -9,6 +9,7 @@ describe('RobotsTxtFile', () => {
nock('http://not-exists.com')
.persist()
.get('/robots.txt')
.delay(500)
.reply(
200,
[
@@ -57,6 +58,47 @@
]);
});

it('respects user-set timeout', async () => {
const start = +Date.now();
const robots = RobotsTxtFile.find('http://not-exists.com/robots.txt', undefined, { timeoutMillis: 200 });

await expect(robots).rejects.toThrow(/timeout/i);
const end = +Date.now();

expect(end - start).toBeGreaterThanOrEqual(200);
expect(end - start).toBeLessThanOrEqual(500);
});

it('respects AbortSignal parameter', async () => {
const controller = new AbortController();
setTimeout(() => controller.abort(), 200);

const start = +Date.now();
const robots = RobotsTxtFile.find('http://not-exists.com/robots.txt', undefined, { signal: controller.signal });

await expect(robots).rejects.toThrow(/aborted/i);
const end = +Date.now();

expect(end - start).toBeGreaterThanOrEqual(200);
expect(end - start).toBeLessThanOrEqual(500);
});

it('respects AbortSignal parameter and timeout together', async () => {
const controller = new AbortController();

const start = +Date.now();
const robots = RobotsTxtFile.find('http://not-exists.com/robots.txt', undefined, {
signal: controller.signal,
timeoutMillis: 200,
});

await expect(robots).rejects.toThrow(/timeout/i);
const end = +Date.now();

expect(end - start).toBeGreaterThanOrEqual(200);
expect(end - start).toBeLessThanOrEqual(500);
});

it('parses allow/deny directives from explicitly provided robots.txt contents', async () => {
const contents = `User-agent: *',
Disallow: *deny_all/
86 changes: 86 additions & 0 deletions packages/utils/test/sitemap.test.ts
@@ -588,4 +588,90 @@ describe('discoverValidSitemaps', () => {
'http://domain-b.com/sitemap.txt',
]);
});

it('aborts when timeoutMillis elapses', async () => {
nock('http://slow-site.com')
.get('/robots.txt')
.delay(5_000)
.reply(200, 'Sitemap: http://slow-site.com/sitemap.xml');

const start = Date.now();
const urls = [];
for await (const url of discoverValidSitemaps(['http://slow-site.com'], { timeoutMillis: 100 })) {
urls.push(url);
}
const elapsed = Date.now() - start;

expect(urls).toEqual([]);
expect(elapsed).toBeLessThan(2_000);
});

it('aborts when external signal is triggered', async () => {
nock('http://slow-site.com')
.get('/robots.txt')
.delay(5_000)
.reply(200, 'Sitemap: http://slow-site.com/sitemap.xml');

const ac = new AbortController();
setTimeout(() => ac.abort(), 100);

const start = Date.now();
const urls = [];
for await (const url of discoverValidSitemaps(['http://slow-site.com'], {
timeoutMillis: 60_000,
signal: ac.signal,
})) {
urls.push(url);
}
const elapsed = Date.now() - start;

expect(urls).toEqual([]);
expect(elapsed).toBeLessThan(2_000);
});

it('aborts immediately when signal is already aborted', async () => {
nock('http://slow-site.com')
.get('/robots.txt')
.delay(5_000)
.reply(200, 'Sitemap: http://slow-site.com/sitemap.xml');

const ac = new AbortController();
ac.abort();

const start = Date.now();
const urls = [];
for await (const url of discoverValidSitemaps(['http://slow-site.com'], { signal: ac.signal })) {
urls.push(url);
}
const elapsed = Date.now() - start;

expect(urls).toEqual([]);
expect(elapsed).toBeLessThan(1_000);
});

it('requestTimeoutMillis aborts slow robots.txt without killing the whole discovery', async () => {
nock('http://slow-site.com')
.get('/robots.txt')
.delay(5_000)
.reply(200, 'Sitemap: http://slow-site.com/sitemap.xml')
.head('/sitemap.xml')
.reply(200, '')
.head('/sitemap.txt')
.reply(404, '')
.head('/sitemap_index.xml')
.reply(404, '');

const start = Date.now();
const urls = [];
for await (const url of discoverValidSitemaps(['http://slow-site.com'], {
timeoutMillis: 30_000,
requestTimeoutMillis: 100,
})) {
urls.push(url);
}
const elapsed = Date.now() - start;

expect(urls).toEqual(['http://slow-site.com/sitemap.xml']);
expect(elapsed).toBeLessThan(2_000);
});
});
4 changes: 0 additions & 4 deletions website/docusaurus.config.js
@@ -273,10 +273,6 @@
hideable: true,
},
},
announcementBar: {
id: `apify-1m-challenge`,
content: `<b><a href="https://apify.com/challenge">Apify $1M Challenge 💰</a></b> Earn and win building with Crawlee!`,
},
navbar: {
hideOnScroll: true,
title: 'Crawlee',
6 changes: 3 additions & 3 deletions website/yarn.lock
@@ -12093,9 +12093,9 @@ __metadata:
linkType: hard

"lodash@npm:^4.17.20, lodash@npm:^4.17.21":
version: 4.17.21
resolution: "lodash@npm:4.17.21"
checksum: 10c0/d8cbea072bb08655bb4c989da418994b073a608dffa608b09ac04b43a791b12aeae7cd7ad919aa4c925f33b48490b5cfe6c1f71d827956071dae2e7bb3a6b74c
version: 4.17.23
resolution: "lodash@npm:4.17.23"
checksum: 10c0/1264a90469f5bb95d4739c43eb6277d15b6d9e186df4ac68c3620443160fc669e2f14c11e7d8b2ccf078b81d06147c01a8ccced9aab9f9f63d50dcf8cace6bf6
languageName: node
linkType: hard
