From b25ec336f0b615b963d92978eea57d67e75d94b4 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Sun, 22 Mar 2026 00:29:57 +0300 Subject: [PATCH 01/24] refactor: extract robots check as a separate function --- apps/worker/src/utils/fetchAsBot.ts | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/apps/worker/src/utils/fetchAsBot.ts b/apps/worker/src/utils/fetchAsBot.ts index 5effabe..81b00e0 100644 --- a/apps/worker/src/utils/fetchAsBot.ts +++ b/apps/worker/src/utils/fetchAsBot.ts @@ -42,6 +42,16 @@ async function getRobots(input: URL): Promise { return robotsParser(robotsUrl.toString(), robots); } +async function checkRobotsAccess(url: URL) { + const robots = await getRobots(url); + + if (robots && robots.isDisallowed(url.toString(), userAgent)) { + throw new RobotDeniedError( + `${userAgent} is disallowed from ${url.hostname}!`, + ); + } +} + type FetchAsBotInit = Omit< Dispatcher.RequestOptions, "origin" | "path" @@ -68,13 +78,7 @@ export async function fetchAsBot(options: FetchAsBotInit) { } = options; const parsedUrl = url instanceof URL ? url : new URL(url); if (!skipRobotsCheck) { - const robots = await getRobots(parsedUrl); - - if (robots && robots.isDisallowed(url.toString(), userAgent)) { - throw new RobotDeniedError( - `${userAgent} is disallowed from ${parsedUrl.hostname}!`, - ); - } + await checkRobotsAccess(parsedUrl); } console.debug(init.method ?? "GET", parsedUrl.href); @@ -121,13 +125,7 @@ export async function fetchAsBotStream({ }: FetchAsBotInit & { writable: Writable }) { const parsedUrl = url instanceof URL ? url : new URL(url); if (!skipRobotsCheck) { - const robots = await getRobots(parsedUrl); - - if (robots && robots.isDisallowed(url.toString(), userAgent)) { - throw new RobotDeniedError( - `${userAgent} is disallowed from ${parsedUrl.hostname}!`, - ); - } + await checkRobotsAccess(parsedUrl); } console.debug(init.method ?? "GET", parsedUrl.href); From a28eb03f8751f9588a7a5cdb35de093a42dc4ad3 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Tue, 24 Mar 2026 23:45:56 +0300 Subject: [PATCH 02/24] refactor: replace console.debug with console.log --- apps/worker/src/utils/fetchAsBot.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/worker/src/utils/fetchAsBot.ts b/apps/worker/src/utils/fetchAsBot.ts index 81b00e0..c6b97b5 100644 --- a/apps/worker/src/utils/fetchAsBot.ts +++ b/apps/worker/src/utils/fetchAsBot.ts @@ -81,7 +81,7 @@ export async function fetchAsBot(options: FetchAsBotInit) { await checkRobotsAccess(parsedUrl); } - console.debug(init.method ?? "GET", parsedUrl.href); + console.log(init.method ?? "GET", parsedUrl.href); const response = await request(url, { ...init, @@ -128,7 +128,7 @@ export async function fetchAsBotStream({ await checkRobotsAccess(parsedUrl); } - console.debug(init.method ?? "GET", parsedUrl.href); + console.log(init.method ?? "GET", parsedUrl.href); await stream( url, From e30f069fe757e7c9d398c1473976efc39f36bfb1 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Thu, 26 Mar 2026 15:55:10 +0300 Subject: [PATCH 03/24] feat: follow redirects in fetchAsBotStream --- apps/worker/src/utils/fetchAsBot.ts | 43 ++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/apps/worker/src/utils/fetchAsBot.ts b/apps/worker/src/utils/fetchAsBot.ts index c6b97b5..1589b12 100644 --- a/apps/worker/src/utils/fetchAsBot.ts +++ b/apps/worker/src/utils/fetchAsBot.ts @@ -121,6 +121,7 @@ export async function fetchAsBotStream({ skipRobotsCheck, maxLength, writable, + followRedirects = 10, ...init }: FetchAsBotInit & { writable: Writable }) { const parsedUrl = url instanceof URL ? url : new URL(url); @@ -130,8 +131,48 @@ export async function fetchAsBotStream({ console.log(init.method ?? "GET", parsedUrl.href); + let crrUrl = url; + let redirectCount = 0; + while (true) { + const response = await request(crrUrl, { + ...init, + headers: { + "User-Agent": userAgent, + "Accept-Language": "en", + ...init?.headers, + }, + signal: init?.signal ?? AbortSignal.timeout(10 * 1000), + }); + + if (response.statusCode == 301 || response.statusCode == 302) { + await response.body.dump(); + const newLocation = response.headers["location"]?.toString(); + + if ( + redirectCount <= followRedirects && + newLocation && + URL.canParse(newLocation) + ) { + console.log( + `redirect (${response.statusCode}) [${crrUrl} -> ${newLocation}]`, + ); + const newUrl = new URL(newLocation); + crrUrl = newUrl; + redirectCount++; + continue; + } + } + + if (response.statusCode < 200 || response.statusCode > 299) { + await response.body.dump(); + throw new Error(`Request ${url} returned ${response.statusCode}`); + } + + break; + } + await stream( - url, + crrUrl, { ...init, headers: { From 55e618f81850c75f5a2f65805a33265e4d9fbb52 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Thu, 26 Mar 2026 22:59:43 +0300 Subject: [PATCH 04/24] fix: correct redirect count condition in fetchAsBotStream --- apps/worker/src/utils/fetchAsBot.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/worker/src/utils/fetchAsBot.ts b/apps/worker/src/utils/fetchAsBot.ts index 1589b12..3e65f57 100644 --- a/apps/worker/src/utils/fetchAsBot.ts +++ b/apps/worker/src/utils/fetchAsBot.ts @@ -149,7 +149,7 @@ export async function fetchAsBotStream({ const newLocation = response.headers["location"]?.toString(); if ( - redirectCount <= followRedirects && + redirectCount < followRedirects && newLocation && URL.canParse(newLocation) ) { From adb5875dc20612d3cc3a4e8208049b2a383e73c9 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Fri, 27 Mar 2026 02:16:01 +0300 Subject: [PATCH 05/24] feat: add repeatTimes parameter to mockEndpoint --- apps/worker/test-utils/server.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/apps/worker/test-utils/server.ts b/apps/worker/test-utils/server.ts index 423453c..d7cffb2 100644 --- a/apps/worker/test-utils/server.ts +++ b/apps/worker/test-utils/server.ts @@ -14,6 +14,7 @@ interface MockEndpointProps { headers?: Record; method?: "get" | "post" | "put" | "delete"; status?: number; + repeatTimes?: number; } export function mockEndpoint({ @@ -22,6 +23,7 @@ export function mockEndpoint({ headers, method = "get", status = 200, + repeatTimes = 1, }: MockEndpointProps) { const url = path instanceof URL ? path : new URL(path); mockAgent @@ -32,5 +34,6 @@ export function mockEndpoint({ }) .reply(status, body, { headers, - }); + }) + .times(repeatTimes); } From fa5d085b1e29a88f782d6a3b7bdb45de2d635457 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Fri, 27 Mar 2026 02:16:32 +0300 Subject: [PATCH 06/24] test: add fetchAsBotStream tests --- apps/worker/src/utils/fetchAsBot.test.ts | 258 +++++++++++++++++------ 1 file changed, 193 insertions(+), 65 deletions(-) diff --git a/apps/worker/src/utils/fetchAsBot.test.ts b/apps/worker/src/utils/fetchAsBot.test.ts index 6e0460e..ea2f3ce 100644 --- a/apps/worker/src/utils/fetchAsBot.test.ts +++ b/apps/worker/src/utils/fetchAsBot.test.ts @@ -1,88 +1,216 @@ +import { Writable } from "stream"; import { mockEndpoint } from "../../test-utils/server.ts"; -import { RobotDeniedError, fetchAsBot, resetCache } from "./fetchAsBot.ts"; +import { + RobotDeniedError, + fetchAsBot, + fetchAsBotStream, + resetCache, +} from "./fetchAsBot.ts"; beforeEach(() => resetCache()); -test("Should throw error when a 400 status code is returned", async () => { - const robotsUrl = new URL("https://example.com/robots.txt"); - const url = new URL("https://example.com/test"); +describe("fetchAsBot", () => { + test("Should throw error when a 400 status code is returned", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); - mockEndpoint({ - path: robotsUrl, - body: "User-agent: *\nDisallow:\n", - headers: { - "content-type": "text/plain", - }, - }); - mockEndpoint({ - path: url, - body: "", - status: 400, + mockEndpoint({ + path: robotsUrl, + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: url, + body: "", + status: 400, + }); + + const response = fetchAsBot({ url, method: "GET" }); + await expect(response).rejects.toThrow(); }); - const response = fetchAsBot({ url, method: "GET" }); - await expect(response).rejects.toThrow(); -}); + test("Should return successful data for a URL with no robots.txt", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); -test("Should return successful data for a URL with no robots.txt", async () => { - const robotsUrl = new URL("https://example.com/robots.txt"); - const url = new URL("https://example.com/test"); + mockEndpoint({ + path: robotsUrl, + status: 404, + body: "", + }); + mockEndpoint({ + path: url, + body: "Hello!", + }); - mockEndpoint({ - path: robotsUrl, - status: 404, - body: "", + const response = await fetchAsBot({ url, method: "GET" }); + const body = await response.body.text(); + expect(body).toBe("Hello!"); }); - mockEndpoint({ - path: url, - body: "Hello!", + + test("Should return successful data for a URL with a valid robots.txt", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); + + mockEndpoint({ + path: robotsUrl, + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: url, + body: "Hello!", + }); + + const response = await fetchAsBot({ url, method: "GET" }); + const body = await response.body.text(); + expect(body).toBe("Hello!"); }); - const response = await fetchAsBot({ url, method: "GET" }); - const body = await response.body.text(); - expect(body).toBe("Hello!"); -}); + test("Should not return data for a URL disallowed by robots.txt", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); -test("Should return successful data for a URL with a valid robots.txt", async () => { - const robotsUrl = new URL("https://example.com/robots.txt"); - const url = new URL("https://example.com/test"); + mockEndpoint({ + path: robotsUrl, + body: "User-agent: *\nDisallow: /test\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: url, + body: "Hello!", + }); - mockEndpoint({ - path: robotsUrl, - body: "User-agent: *\nDisallow:\n", - headers: { - "content-type": "text/plain", - }, + const response = await fetchAsBot({ url, method: "GET" }).catch((e) => e); + expect(response).to.be.instanceOf(RobotDeniedError); + expect((response as Error).message).toBe( + "playful-programming/1.0 is disallowed from example.com!", + ); }); - mockEndpoint({ - path: url, - body: "Hello!", +}); + +describe("fetchAsBotStream", () => { + test("Should throw error when a 400 status code is returned", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); + + mockEndpoint({ + path: robotsUrl, + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: url, + body: "", + status: 400, + }); + + await expect( + fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(_, __, next) { + next(); + }, + }), + }), + ).rejects.toThrow(); }); - const response = await fetchAsBot({ url, method: "GET" }); - const body = await response.body.text(); - expect(body).toBe("Hello!"); -}); + test("Should return successful data for a URL with no robots.txt", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); -test("Should not return data for a URL disallowed by robots.txt", async () => { - const robotsUrl = new URL("https://example.com/robots.txt"); - const url = new URL("https://example.com/test"); + mockEndpoint({ + path: robotsUrl, + status: 404, + body: "", + }); + mockEndpoint({ + path: url, + body: "Hello!", + repeatTimes: 2, + }); - mockEndpoint({ - path: robotsUrl, - body: "User-agent: *\nDisallow: /test\n", - headers: { - "content-type": "text/plain", - }, + let body = ""; + await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _, next) { + body += chunk; + next(); + }, + }), + }); + expect(body).toBe("Hello!"); }); - mockEndpoint({ - path: url, - body: "Hello!", + + test("Should return successful data for a URL with a valid robots.txt", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); + + mockEndpoint({ + path: robotsUrl, + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: url, + body: "Hello!", + repeatTimes: 2, + }); + + let body = ""; + await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _, next) { + body += chunk; + next(); + }, + }), + }); + expect(body).toBe("Hello!"); }); - const response = await fetchAsBot({ url, method: "GET" }).catch((e) => e); - expect(response).to.be.instanceOf(RobotDeniedError); - expect((response as Error).message).toBe( - "playful-programming/1.0 is disallowed from example.com!", - ); + test("Should not return data for a URL disallowed by robots.txt", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); + + mockEndpoint({ + path: robotsUrl, + body: "User-agent: *\nDisallow: /test\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: url, + body: "Hello!", + repeatTimes: 2, + }); + + const error = await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable(), + }).catch((e) => e as Error); + expect(error).toBeInstanceOf(RobotDeniedError); + expect(error?.message).toBe( + `playful-programming/1.0 is disallowed from ${url.hostname}!`, + ); + }); }); From e74ebe1642e2cfa19d09ee03d5a8ea4ebb339a04 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Fri, 27 Mar 2026 22:24:40 +0300 Subject: [PATCH 07/24] test: fix tests to align with new fetchAsBotStream behavior --- apps/worker/src/tasks/url-metadata/utils/fetchPageHtml.test.ts | 3 +++ .../worker/src/tasks/url-metadata/utils/fetchPageIcons.test.ts | 3 +++ apps/worker/src/utils/fetchHtmlHead.test.ts | 1 + 3 files changed, 7 insertions(+) diff --git a/apps/worker/src/tasks/url-metadata/utils/fetchPageHtml.test.ts b/apps/worker/src/tasks/url-metadata/utils/fetchPageHtml.test.ts index d07c626..b00d889 100644 --- a/apps/worker/src/tasks/url-metadata/utils/fetchPageHtml.test.ts +++ b/apps/worker/src/tasks/url-metadata/utils/fetchPageHtml.test.ts @@ -23,6 +23,7 @@ test("should fetch page HTML", async () => { mockEndpoint({ path: domain.href, body: html, + repeatTimes: 2, }); const response = await fetchPageHtml(domain); @@ -98,6 +99,7 @@ test("should get page title", async () => { mockEndpoint({ path: domain, body: html, + repeatTimes: 2, }); const root = await fetchPageHtml(new URL(domain)); const response = await getPageTitle(root!); @@ -109,6 +111,7 @@ test("Should gather image URL from OpenGraph metadata", async () => { mockEndpoint({ path: domain, body: html, + repeatTimes: 2, }); const root = await fetchPageHtml(new URL(domain)); const response = await getOpenGraphImages(root!, new URL(domain)); diff --git a/apps/worker/src/tasks/url-metadata/utils/fetchPageIcons.test.ts b/apps/worker/src/tasks/url-metadata/utils/fetchPageIcons.test.ts index 71359dd..b88359c 100644 --- a/apps/worker/src/tasks/url-metadata/utils/fetchPageIcons.test.ts +++ b/apps/worker/src/tasks/url-metadata/utils/fetchPageIcons.test.ts @@ -19,6 +19,7 @@ test("Should fetch basic page icon", async () => { mockEndpoint({ path: domain, body: html, + repeatTimes: 2, }); const srcHast = await fetchPageHtml(new URL(domain)); const iconHref = await fetchPageIcons(new URL(domain), srcHast!); @@ -53,10 +54,12 @@ test("Should fetch page icon from manifest as backup", async () => { mockEndpoint({ path: domain, body: pageHtml, + repeatTimes: 2, }); mockEndpoint({ path: domain + "manifest.json", body: JSON.stringify(manifest), + repeatTimes: 2, }); const srcHast = await fetchPageHtml(new URL(domain)); diff --git a/apps/worker/src/utils/fetchHtmlHead.test.ts b/apps/worker/src/utils/fetchHtmlHead.test.ts index 91e5975..62065e0 100644 --- a/apps/worker/src/utils/fetchHtmlHead.test.ts +++ b/apps/worker/src/utils/fetchHtmlHead.test.ts @@ -23,6 +23,7 @@ test("Should return the HTML head when it exists", async () => { `.trim(), + repeatTimes: 2, }); const response = await fetchHtmlHead(new URL(url)); From cbdee204516c58c5e1c6c08b5b54b22f5f2e270e Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Fri, 27 Mar 2026 22:26:52 +0300 Subject: [PATCH 08/24] test: remove unused mockEndpoints in fetchAsBot.test.ts to prevent test side effects --- apps/worker/src/utils/fetchAsBot.test.ts | 9 --------- 1 file changed, 9 deletions(-) diff --git a/apps/worker/src/utils/fetchAsBot.test.ts b/apps/worker/src/utils/fetchAsBot.test.ts index ea2f3ce..317530b 100644 --- a/apps/worker/src/utils/fetchAsBot.test.ts +++ b/apps/worker/src/utils/fetchAsBot.test.ts @@ -82,10 +82,6 @@ describe("fetchAsBot", () => { "content-type": "text/plain", }, }); - mockEndpoint({ - path: url, - body: "Hello!", - }); const response = await fetchAsBot({ url, method: "GET" }).catch((e) => e); expect(response).to.be.instanceOf(RobotDeniedError); @@ -197,11 +193,6 @@ describe("fetchAsBotStream", () => { "content-type": "text/plain", }, }); - mockEndpoint({ - path: url, - body: "Hello!", - repeatTimes: 2, - }); const error = await fetchAsBotStream({ url, From a5fa24ae29e744957009a608b219357844208823 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Sun, 29 Mar 2026 00:00:38 +0300 Subject: [PATCH 09/24] refactor: use one fetch call instead of two duplicated ones to detect and follow the redirects in fetchAsBotStream --- apps/worker/src/utils/fetchAsBot.ts | 103 ++++++++++++++-------------- 1 file changed, 52 insertions(+), 51 deletions(-) diff --git a/apps/worker/src/utils/fetchAsBot.ts b/apps/worker/src/utils/fetchAsBot.ts index 3e65f57..b3506d1 100644 --- a/apps/worker/src/utils/fetchAsBot.ts +++ b/apps/worker/src/utils/fetchAsBot.ts @@ -1,7 +1,9 @@ -import { request, stream, type Dispatcher } from "undici"; +import { createWriteStream } from "fs"; import { LRUCache } from "lru-cache"; +import { devNull } from "os"; import robotsParserDefault, { type Robot } from "robots-parser"; -import type { Writable } from "stream"; +import { type Writable } from "stream"; +import { request, stream, type Dispatcher } from "undici"; const robotsParser = robotsParserDefault as never as typeof robotsParserDefault.default; @@ -131,58 +133,57 @@ export async function fetchAsBotStream({ console.log(init.method ?? "GET", parsedUrl.href); - let crrUrl = url; - let redirectCount = 0; + const opaque = { + writable, + followRedirects, + url, + redirect: { + active: false, + count: 0, + }, + }; + while (true) { - const response = await request(crrUrl, { - ...init, - headers: { - "User-Agent": userAgent, - "Accept-Language": "en", - ...init?.headers, + await stream( + opaque.url, + { + ...init, + headers: { + "User-Agent": userAgent, + "Accept-Language": "en", + ...init?.headers, + }, + signal: init?.signal ?? AbortSignal.timeout(10 * 1000), + opaque, }, - signal: init?.signal ?? AbortSignal.timeout(10 * 1000), - }); - - if (response.statusCode == 301 || response.statusCode == 302) { - await response.body.dump(); - const newLocation = response.headers["location"]?.toString(); - - if ( - redirectCount < followRedirects && - newLocation && - URL.canParse(newLocation) - ) { - console.log( - `redirect (${response.statusCode}) [${crrUrl} -> ${newLocation}]`, - ); - const newUrl = new URL(newLocation); - crrUrl = newUrl; - redirectCount++; - continue; - } - } + ({ opaque, statusCode, headers }) => { + if ([301, 302, 303, 307, 308].includes(statusCode)) { + const newLocation = headers["location"]?.toString(); + if ( + opaque.redirect.count < opaque.followRedirects && + newLocation && + URL.canParse(newLocation) + ) { + console.log( + `redirect (${statusCode}) [${opaque.url} -> ${newLocation}]`, + ); + opaque.url = new URL(headers["location"] as string); + opaque.redirect.count++; + } + + return createWriteStream(devNull); + } + + if (statusCode < 200 || statusCode > 299) { + throw new Error(`Request ${url} returned ${statusCode}`); + } + + return opaque.writable; + }, + ); - if (response.statusCode < 200 || response.statusCode > 299) { - await response.body.dump(); - throw new Error(`Request ${url} returned ${response.statusCode}`); + if (!opaque.redirect.active) { + break; } - - break; } - - await stream( - crrUrl, - { - ...init, - headers: { - "User-Agent": userAgent, - "Accept-Language": "en", - ...init?.headers, - }, - signal: init?.signal ?? AbortSignal.timeout(10 * 1000), - opaque: writable, - }, - ({ opaque }) => opaque, - ); } From 61cdf24d68e523f2b13cc973a2630b985405f40e Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Sun, 29 Mar 2026 00:03:16 +0300 Subject: [PATCH 10/24] test: remove unnecessary repeatTimes property from mockEndpoint calls from tests related to fetchAsBotStream --- apps/worker/src/tasks/url-metadata/utils/fetchPageHtml.test.ts | 3 --- .../worker/src/tasks/url-metadata/utils/fetchPageIcons.test.ts | 3 --- apps/worker/src/utils/fetchAsBot.test.ts | 2 -- apps/worker/src/utils/fetchHtmlHead.test.ts | 1 - 4 files changed, 9 deletions(-) diff --git a/apps/worker/src/tasks/url-metadata/utils/fetchPageHtml.test.ts b/apps/worker/src/tasks/url-metadata/utils/fetchPageHtml.test.ts index b00d889..d07c626 100644 --- a/apps/worker/src/tasks/url-metadata/utils/fetchPageHtml.test.ts +++ b/apps/worker/src/tasks/url-metadata/utils/fetchPageHtml.test.ts @@ -23,7 +23,6 @@ test("should fetch page HTML", async () => { mockEndpoint({ path: domain.href, body: html, - repeatTimes: 2, }); const response = await fetchPageHtml(domain); @@ -99,7 +98,6 @@ test("should get page title", async () => { mockEndpoint({ path: domain, body: html, - repeatTimes: 2, }); const root = await fetchPageHtml(new URL(domain)); const response = await getPageTitle(root!); @@ -111,7 +109,6 @@ test("Should gather image URL from OpenGraph metadata", async () => { mockEndpoint({ path: domain, body: html, - repeatTimes: 2, }); const root = await fetchPageHtml(new URL(domain)); const response = await getOpenGraphImages(root!, new URL(domain)); diff --git a/apps/worker/src/tasks/url-metadata/utils/fetchPageIcons.test.ts b/apps/worker/src/tasks/url-metadata/utils/fetchPageIcons.test.ts index b88359c..71359dd 100644 --- a/apps/worker/src/tasks/url-metadata/utils/fetchPageIcons.test.ts +++ b/apps/worker/src/tasks/url-metadata/utils/fetchPageIcons.test.ts @@ -19,7 +19,6 @@ test("Should fetch basic page icon", async () => { mockEndpoint({ path: domain, body: html, - repeatTimes: 2, }); const srcHast = await fetchPageHtml(new URL(domain)); const iconHref = await fetchPageIcons(new URL(domain), srcHast!); @@ -54,12 +53,10 @@ test("Should fetch page icon from manifest as backup", async () => { mockEndpoint({ path: domain, body: pageHtml, - repeatTimes: 2, }); mockEndpoint({ path: domain + "manifest.json", body: JSON.stringify(manifest), - repeatTimes: 2, }); const srcHast = await fetchPageHtml(new URL(domain)); diff --git a/apps/worker/src/utils/fetchAsBot.test.ts b/apps/worker/src/utils/fetchAsBot.test.ts index 317530b..207fb57 100644 --- a/apps/worker/src/utils/fetchAsBot.test.ts +++ b/apps/worker/src/utils/fetchAsBot.test.ts @@ -134,7 +134,6 @@ describe("fetchAsBotStream", () => { mockEndpoint({ path: url, body: "Hello!", - repeatTimes: 2, }); let body = ""; @@ -165,7 +164,6 @@ describe("fetchAsBotStream", () => { mockEndpoint({ path: url, body: "Hello!", - repeatTimes: 2, }); let body = ""; diff --git a/apps/worker/src/utils/fetchHtmlHead.test.ts b/apps/worker/src/utils/fetchHtmlHead.test.ts index 62065e0..91e5975 100644 --- a/apps/worker/src/utils/fetchHtmlHead.test.ts +++ b/apps/worker/src/utils/fetchHtmlHead.test.ts @@ -23,7 +23,6 @@ test("Should return the HTML head when it exists", async () => { `.trim(), - repeatTimes: 2, }); const response = await fetchHtmlHead(new URL(url)); From 8922604992ef0af9a29c5ddf6bccf0c9ba668ca0 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Sun, 29 Mar 2026 01:05:21 +0300 Subject: [PATCH 11/24] test: clean up and refactor fetchAsBotStream tests --- apps/worker/src/utils/fetchAsBot.test.ts | 28 ++++++++++++------------ 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/apps/worker/src/utils/fetchAsBot.test.ts b/apps/worker/src/utils/fetchAsBot.test.ts index 207fb57..02cf81e 100644 --- a/apps/worker/src/utils/fetchAsBot.test.ts +++ b/apps/worker/src/utils/fetchAsBot.test.ts @@ -1,3 +1,5 @@ +import { createWriteStream } from "fs"; +import { devNull } from "os"; import { Writable } from "stream"; import { mockEndpoint } from "../../test-utils/server.ts"; import { @@ -109,17 +111,13 @@ describe("fetchAsBotStream", () => { status: 400, }); - await expect( - fetchAsBotStream({ - url, - method: "GET", - writable: new Writable({ - write(_, __, next) { - next(); - }, - }), - }), - ).rejects.toThrow(); + const botFetchStream = fetchAsBotStream({ + url, + method: "GET", + writable: createWriteStream(devNull), + }); + + await expect(botFetchStream).rejects.toThrow(); }); test("Should return successful data for a URL with no robots.txt", async () => { @@ -141,7 +139,7 @@ describe("fetchAsBotStream", () => { url, method: "GET", writable: new Writable({ - write(chunk, _, next) { + write(chunk, _encoding, next) { body += chunk; next(); }, @@ -171,12 +169,13 @@ describe("fetchAsBotStream", () => { url, method: "GET", writable: new Writable({ - write(chunk, _, next) { + write(chunk, _encoding, next) { body += chunk; next(); }, }), }); + expect(body).toBe("Hello!"); }); @@ -195,8 +194,9 @@ describe("fetchAsBotStream", () => { const error = await fetchAsBotStream({ url, method: "GET", - writable: new Writable(), + writable: createWriteStream(devNull), }).catch((e) => e as Error); + expect(error).toBeInstanceOf(RobotDeniedError); expect(error?.message).toBe( `playful-programming/1.0 is disallowed from ${url.hostname}!`, From 3f036cb0d7e70027e0a51637485da3fe73f96a96 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Mon, 30 Mar 2026 00:34:34 +0300 Subject: [PATCH 12/24] test: initialize mockAgent in beforeEach and assert for pending interceptors and cleanup in afterEach --- apps/worker/test-utils/server.ts | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/apps/worker/test-utils/server.ts b/apps/worker/test-utils/server.ts index d7cffb2..dfa5f03 100644 --- a/apps/worker/test-utils/server.ts +++ b/apps/worker/test-utils/server.ts @@ -1,12 +1,19 @@ import { MockAgent, setGlobalDispatcher } from "undici"; -const mockAgent = new MockAgent({ - connections: 1, - bodyTimeout: 10, - connectTimeout: 10, - headersTimeout: 10, +let mockAgent: MockAgent; +beforeEach(() => { + mockAgent = new MockAgent({ + connections: 1, + bodyTimeout: 10, + connectTimeout: 10, + headersTimeout: 10, + }); + setGlobalDispatcher(mockAgent); +}); +afterEach(async () => { + mockAgent.assertNoPendingInterceptors(); + await mockAgent.close(); }); -setGlobalDispatcher(mockAgent); interface MockEndpointProps { path: string | URL; From 9a9f6701339bcf84e722c1d1bc6d9dc559f56872 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Mon, 30 Mar 2026 00:41:54 +0300 Subject: [PATCH 13/24] fix: make method property of fetchAsBot more strict by forcing only uppercase method names --- apps/worker/src/utils/fetchAsBot.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/worker/src/utils/fetchAsBot.ts b/apps/worker/src/utils/fetchAsBot.ts index b3506d1..e1cb13a 100644 --- a/apps/worker/src/utils/fetchAsBot.ts +++ b/apps/worker/src/utils/fetchAsBot.ts @@ -56,9 +56,10 @@ async function checkRobotsAccess(url: URL) { type FetchAsBotInit = Omit< Dispatcher.RequestOptions, - "origin" | "path" + "origin" | "path" | "method" > & { url: string | URL; + method: "GET" | "POST" | "PUT" | "DELETE" | "HEAD"; skipRobotsCheck?: boolean; maxLength?: number; followRedirects?: number; From 8e04fdb249b659eeeabfc3b1867fcc81d804a384 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Sun, 5 Apr 2026 22:07:16 +0300 Subject: [PATCH 14/24] refactor: organize and refactor fetchAsBotStream function --- apps/worker/src/utils/fetchAsBot.ts | 89 ++++++++++++++++++----------- 1 file changed, 57 insertions(+), 32 deletions(-) diff --git a/apps/worker/src/utils/fetchAsBot.ts b/apps/worker/src/utils/fetchAsBot.ts index e1cb13a..2492f49 100644 --- a/apps/worker/src/utils/fetchAsBot.ts +++ b/apps/worker/src/utils/fetchAsBot.ts @@ -119,6 +119,52 @@ export async function fetchAsBot(options: FetchAsBotInit) { return response; } +interface FetchAsBotStreamFactoryOpaque { + writable: Writable; + followRedirects: number; + currentUrl: string | URL; + redirect: boolean; + error: Error | null; +} + +const fetchAsBotStreamFactory: Dispatcher.StreamFactory< + FetchAsBotStreamFactoryOpaque +> = ({ opaque, statusCode, headers }) => { + opaque.redirect = false; + + if ( + [301, 302, 303, 307, 308].includes(statusCode) && + opaque.followRedirects > 0 + ) { + const newLocation = headers["location"]?.toString() ?? ""; + const newLocationUrl = URL.parse(newLocation, opaque.currentUrl); + if (newLocationUrl) { + console.log( + `redirect (${statusCode}) [${opaque.currentUrl} -> ${newLocationUrl}]`, + ); + opaque.currentUrl = newLocationUrl; + opaque.followRedirects -= 1; + opaque.redirect = true; + } else { + opaque.error = new Error( + `The redirect location ${newLocation} couldn't be parsed as a URL for ${opaque.currentUrl}`, + ); + } + + return createWriteStream(devNull); + } + + if (statusCode < 200 || statusCode > 299) { + opaque.error = new Error( + `Request ${opaque.currentUrl} returned ${statusCode}`, + ); + + return createWriteStream(devNull); + } + + return opaque.writable; +}; + export async function fetchAsBotStream({ url, skipRobotsCheck, @@ -134,19 +180,17 @@ export async function fetchAsBotStream({ console.log(init.method ?? "GET", parsedUrl.href); - const opaque = { + const opaque: FetchAsBotStreamFactoryOpaque = { writable, followRedirects, - url, - redirect: { - active: false, - count: 0, - }, + currentUrl: url, + redirect: false, + error: null, }; while (true) { await stream( - opaque.url, + opaque.currentUrl, { ...init, headers: { @@ -157,33 +201,14 @@ export async function fetchAsBotStream({ signal: init?.signal ?? AbortSignal.timeout(10 * 1000), opaque, }, - ({ opaque, statusCode, headers }) => { - if ([301, 302, 303, 307, 308].includes(statusCode)) { - const newLocation = headers["location"]?.toString(); - if ( - opaque.redirect.count < opaque.followRedirects && - newLocation && - URL.canParse(newLocation) - ) { - console.log( - `redirect (${statusCode}) [${opaque.url} -> ${newLocation}]`, - ); - opaque.url = new URL(headers["location"] as string); - opaque.redirect.count++; - } - - return createWriteStream(devNull); - } - - if (statusCode < 200 || statusCode > 299) { - throw new Error(`Request ${url} returned ${statusCode}`); - } - - return opaque.writable; - }, + fetchAsBotStreamFactory, ); - if (!opaque.redirect.active) { + if (opaque.error) { + throw opaque.error; + } + + if (!opaque.redirect) { break; } } From 2a9e76d213d497a025f918ad76956d59b60392a4 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Sun, 5 Apr 2026 22:24:14 +0300 Subject: [PATCH 15/24] feat: validate redirect protocol for redirects in fetchAsBotStream --- apps/worker/src/utils/fetchAsBot.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/apps/worker/src/utils/fetchAsBot.ts b/apps/worker/src/utils/fetchAsBot.ts index 2492f49..505403c 100644 --- a/apps/worker/src/utils/fetchAsBot.ts +++ b/apps/worker/src/utils/fetchAsBot.ts @@ -142,6 +142,14 @@ const fetchAsBotStreamFactory: Dispatcher.StreamFactory< console.log( `redirect (${statusCode}) [${opaque.currentUrl} -> ${newLocationUrl}]`, ); + + if (!["https:", "http:"].includes(newLocationUrl.protocol)) { + opaque.error = new Error( + `Invalid redirect protocol for ${opaque.currentUrl}`, + ); + return createWriteStream(devNull); + } + opaque.currentUrl = newLocationUrl; opaque.followRedirects -= 1; opaque.redirect = true; From e730b7e8f6f78c6d2c5492707fc9980dfec46ad6 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Sun, 5 Apr 2026 22:32:14 +0300 Subject: [PATCH 16/24] fix: update currentUrl type to URL and use origin directly in URL.parse --- apps/worker/src/utils/fetchAsBot.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/worker/src/utils/fetchAsBot.ts b/apps/worker/src/utils/fetchAsBot.ts index 505403c..3ff6466 100644 --- a/apps/worker/src/utils/fetchAsBot.ts +++ b/apps/worker/src/utils/fetchAsBot.ts @@ -122,7 +122,7 @@ export async function fetchAsBot(options: FetchAsBotInit) { interface FetchAsBotStreamFactoryOpaque { writable: Writable; followRedirects: number; - currentUrl: string | URL; + currentUrl: URL; redirect: boolean; error: Error | null; } @@ -137,7 +137,7 @@ const fetchAsBotStreamFactory: Dispatcher.StreamFactory< opaque.followRedirects > 0 ) { const newLocation = headers["location"]?.toString() ?? ""; - const newLocationUrl = URL.parse(newLocation, opaque.currentUrl); + const newLocationUrl = URL.parse(newLocation, opaque.currentUrl.origin); if (newLocationUrl) { console.log( `redirect (${statusCode}) [${opaque.currentUrl} -> ${newLocationUrl}]`, @@ -191,7 +191,7 @@ export async function fetchAsBotStream({ const opaque: FetchAsBotStreamFactoryOpaque = { writable, followRedirects, - currentUrl: url, + currentUrl: parsedUrl, redirect: false, error: null, }; From 185f9088e24b308ab23861934865b347e6d20df3 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Sun, 5 Apr 2026 22:35:47 +0300 Subject: [PATCH 17/24] feat: update fetchAsBot to match implementation with fetchAsBotStream --- apps/worker/src/utils/fetchAsBot.ts | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/apps/worker/src/utils/fetchAsBot.ts b/apps/worker/src/utils/fetchAsBot.ts index 3ff6466..cca30f7 100644 --- a/apps/worker/src/utils/fetchAsBot.ts +++ b/apps/worker/src/utils/fetchAsBot.ts @@ -96,18 +96,32 @@ export async function fetchAsBot(options: FetchAsBotInit) { signal: init?.signal ?? AbortSignal.timeout(10 * 1000), }); - if (response.statusCode == 301 || response.statusCode == 302) { + if ( + [301, 302, 303, 307, 308].includes(response.statusCode) && + followRedirects > 0 + ) { await response.body.dump(); - const newLocation = response.headers["location"]?.toString(); - console.log(`redirect (${response.statusCode})`); + const newLocation = response.headers["location"]?.toString() ?? ""; + const newLocationUrl = URL.parse(newLocation, parsedUrl.origin); + + if (newLocationUrl) { + console.log( + `redirect (${response.statusCode}) [${url} -> ${newLocationUrl}]`, + ); + + if (!["https:", "http:"].includes(newLocationUrl.protocol)) { + throw new Error(`Invalid redirect protocol for ${url}`); + } - if (followRedirects > 0 && newLocation && URL.canParse(newLocation)) { - const newUrl = new URL(newLocation); return await fetchAsBot({ ...options, - url: newUrl, + url: newLocationUrl, followRedirects: followRedirects - 1, }); + } else { + throw new Error( + `The redirect location ${newLocation} couldn't be parsed as a URL for ${url}`, + ); } } From 09538e2de289419ff2ae815d33f9719610a95882 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Sun, 5 Apr 2026 23:31:35 +0300 Subject: [PATCH 18/24] fix: move robots check inside the while loop to check for redirections too --- apps/worker/src/utils/fetchAsBot.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/worker/src/utils/fetchAsBot.ts b/apps/worker/src/utils/fetchAsBot.ts index cca30f7..e8bf6e8 100644 --- a/apps/worker/src/utils/fetchAsBot.ts +++ b/apps/worker/src/utils/fetchAsBot.ts @@ -196,9 +196,6 @@ export async function fetchAsBotStream({ ...init }: FetchAsBotInit & { writable: Writable }) { const parsedUrl = url instanceof URL ? url : new URL(url); - if (!skipRobotsCheck) { - await checkRobotsAccess(parsedUrl); - } console.log(init.method ?? "GET", parsedUrl.href); @@ -211,6 +208,10 @@ export async function fetchAsBotStream({ }; while (true) { + if (!skipRobotsCheck) { + await checkRobotsAccess(parsedUrl); + } + await stream( opaque.currentUrl, { From 2bb66669168192eff9786d6fdbb745996d278959 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Sun, 5 Apr 2026 23:35:01 +0300 Subject: [PATCH 19/24] refactor: update fetchAsBot to use parsedUrl for consistency --- apps/worker/src/utils/fetchAsBot.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/apps/worker/src/utils/fetchAsBot.ts b/apps/worker/src/utils/fetchAsBot.ts index e8bf6e8..70893ac 100644 --- a/apps/worker/src/utils/fetchAsBot.ts +++ b/apps/worker/src/utils/fetchAsBot.ts @@ -86,7 +86,7 @@ export async function fetchAsBot(options: FetchAsBotInit) { console.log(init.method ?? "GET", parsedUrl.href); - const response = await request(url, { + const response = await request(parsedUrl, { ...init, headers: { "User-Agent": userAgent, @@ -106,11 +106,11 @@ export async function fetchAsBot(options: FetchAsBotInit) { if (newLocationUrl) { console.log( - `redirect (${response.statusCode}) [${url} -> ${newLocationUrl}]`, + `redirect (${response.statusCode}) [${parsedUrl} -> ${newLocationUrl}]`, ); if (!["https:", "http:"].includes(newLocationUrl.protocol)) { - throw new Error(`Invalid redirect protocol for ${url}`); + throw new Error(`Invalid redirect protocol for ${parsedUrl}`); } return await fetchAsBot({ @@ -120,14 +120,14 @@ export async function fetchAsBot(options: FetchAsBotInit) { }); } else { throw new Error( - `The redirect location ${newLocation} couldn't be parsed as a URL for ${url}`, + `The redirect location ${newLocation} couldn't be parsed as a URL for ${parsedUrl}`, ); } } if (response.statusCode < 200 || response.statusCode > 299) { await response.body.dump(); - throw new Error(`Request ${url} returned ${response.statusCode}`); + throw new Error(`Request ${parsedUrl} returned ${response.statusCode}`); } return response; From 0dc0d348d0ec7e3d1051ec4a506e407afc6ac3a7 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Wed, 8 Apr 2026 08:48:43 +0300 Subject: [PATCH 20/24] test: disable real network connections in tests to ensure all requests are mocked --- apps/worker/test-utils/server.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/worker/test-utils/server.ts b/apps/worker/test-utils/server.ts index dfa5f03..57c6486 100644 --- a/apps/worker/test-utils/server.ts +++ b/apps/worker/test-utils/server.ts @@ -8,6 +8,7 @@ beforeEach(() => { connectTimeout: 10, headersTimeout: 10, }); + mockAgent.disableNetConnect(); setGlobalDispatcher(mockAgent); }); afterEach(async () => { From 8391734ea0b3dd558f8f1d602b9963333a4bfa55 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Thu, 9 Apr 2026 00:43:39 +0300 Subject: [PATCH 21/24] fix: handle empty newLocation in redirects and add support for relative redirects --- apps/worker/src/utils/fetchAsBot.ts | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/apps/worker/src/utils/fetchAsBot.ts b/apps/worker/src/utils/fetchAsBot.ts index 70893ac..76a8025 100644 --- a/apps/worker/src/utils/fetchAsBot.ts +++ b/apps/worker/src/utils/fetchAsBot.ts @@ -101,9 +101,11 @@ export async function fetchAsBot(options: FetchAsBotInit) { followRedirects > 0 ) { await response.body.dump(); - const newLocation = response.headers["location"]?.toString() ?? ""; - const newLocationUrl = URL.parse(newLocation, parsedUrl.origin); + const newLocation = response.headers["location"]?.toString(); + const newLocationUrl = newLocation + ? URL.parse(newLocation, parsedUrl) + : null; if (newLocationUrl) { console.log( `redirect (${response.statusCode}) [${parsedUrl} -> ${newLocationUrl}]`, @@ -150,8 +152,10 @@ const fetchAsBotStreamFactory: Dispatcher.StreamFactory< [301, 302, 303, 307, 308].includes(statusCode) && opaque.followRedirects > 0 ) { - const newLocation = headers["location"]?.toString() ?? ""; - const newLocationUrl = URL.parse(newLocation, opaque.currentUrl.origin); + const newLocation = headers["location"]?.toString(); + const newLocationUrl = newLocation + ? URL.parse(newLocation, opaque.currentUrl) + : null; if (newLocationUrl) { console.log( `redirect (${statusCode}) [${opaque.currentUrl} -> ${newLocationUrl}]`, From 14b9d71c07974a74542656c41456b9d674b22771 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Thu, 9 Apr 2026 00:43:53 +0300 Subject: [PATCH 22/24] fix: update robots access check to use currentUrl in fetchAsBotStream --- apps/worker/src/utils/fetchAsBot.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/worker/src/utils/fetchAsBot.ts b/apps/worker/src/utils/fetchAsBot.ts index 76a8025..5ec6009 100644 --- a/apps/worker/src/utils/fetchAsBot.ts +++ b/apps/worker/src/utils/fetchAsBot.ts @@ -213,7 +213,7 @@ export async function fetchAsBotStream({ while (true) { if (!skipRobotsCheck) { - await checkRobotsAccess(parsedUrl); + await checkRobotsAccess(opaque.currentUrl); } await stream( From 9206210b2020e7ea82d561678284732c6fbfe64d Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Thu, 9 Apr 2026 00:44:19 +0300 Subject: [PATCH 23/24] test: add tests for redirects in fetchAsBot and fetchAsBotStream --- apps/worker/src/utils/fetchAsBot.test.ts | 1206 +++++++++++++++++++++- 1 file changed, 1175 insertions(+), 31 deletions(-) diff --git a/apps/worker/src/utils/fetchAsBot.test.ts b/apps/worker/src/utils/fetchAsBot.test.ts index 02cf81e..3c646c9 100644 --- a/apps/worker/src/utils/fetchAsBot.test.ts +++ b/apps/worker/src/utils/fetchAsBot.test.ts @@ -1,6 +1,7 @@ import { createWriteStream } from "fs"; import { devNull } from "os"; import { Writable } from "stream"; +import { request } from "undici"; import { mockEndpoint } from "../../test-utils/server.ts"; import { RobotDeniedError, @@ -91,47 +92,977 @@ describe("fetchAsBot", () => { "playful-programming/1.0 is disallowed from example.com!", ); }); + + test("Should return data for a URL disallowed by robots.txt when skipRobotsCheck is true", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + mockEndpoint({ + path: robotsUrl, + body: "User-agent: *\nDisallow: /test\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("https://example.com/test"); + const mockedBody = "Hello!"; + mockEndpoint({ + path: url, + body: mockedBody, + }); + + const response = await fetchAsBot({ + url, + method: "GET", + skipRobotsCheck: true, + }); + + expect(await response.body.text()).toBe(mockedBody); + + // Consume the remaining robots.txt mock so afterEach has no pending interceptors + await request(robotsUrl); + }); + + test("Should handle single redirect", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectPath = "/another-test"; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: 301, + }); + + const redirectedUrl = new URL(redirectPath, baseUrl); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + const response = await fetchAsBot({ + url, + method: "GET", + }); + + expect(await response.body.text()).toBe(redirectionBody); + }); + + test("Should handle chain of redirects", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const redirectionBody = "This is the redirection result."; + for (let i = 1; i <= 3; i++) { + if (i === 3) { + mockEndpoint({ + path: new URL(`/${i}`, baseUrl), + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + continue; + } + + mockEndpoint({ + path: new URL(`/${i}`, baseUrl), + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: new URL(`/${i + 1}`, baseUrl).toString(), + }, + status: 301, + }); + } + + const response = await fetchAsBot({ + url: new URL("/1", baseUrl), + method: "GET", + }); + + expect(await response.body.text()).toBe(redirectionBody); + }); + + test("Should throw error when redirect limit is exceeded", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + for (let i = 1; i <= 12; i++) { + if (i === 12) { + mockEndpoint({ + path: new URL(`/${i}`, baseUrl), + body: "This is the redirection result.", + headers: { + "content-type": "text/plain", + }, + }); + continue; + } + + mockEndpoint({ + path: new URL(`/${i}`, baseUrl), + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: `/${i + 1}`, + }, + status: 302, + }); + } + + const response = await fetchAsBot({ + url: new URL("/1", baseUrl), + method: "GET", + }).catch((e) => e as Error); + + expect(response).toBeInstanceOf(Error); + + // Consume the remaining redirect mocks so afterEach has no pending interceptors + await request(new URL("/12", baseUrl)); + }); + + test("Should throw error when redirecting to invalid location", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const invalidLocation = "http://[invalid-url"; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: invalidLocation, + }, + status: 303, + }); + + const response = await fetchAsBot({ + url: new URL("/test", baseUrl), + method: "GET", + }).catch((e) => e as Error); + + expect.assert( + response instanceof Error === true, + "Expected an error to be thrown", + ); + expect(response.message).toBe( + `The redirect location ${invalidLocation} couldn't be parsed as a URL for ${url}`, + ); + }); + + test("Should throw error when location header is missing on redirect", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + }, + status: 307, + }); + + const response = await fetchAsBot({ + url: new URL("/test", baseUrl), + method: "GET", + }).catch((e) => e as Error); + + expect.assert( + response instanceof Error === true, + "Expected an error to be thrown", + ); + expect(response.message).toBe( + `The redirect location undefined couldn't be parsed as a URL for ${url}`, + ); + }); + + test("Should throw error when redirecting to unsupported protocol", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: "ftp://example.com/file", + }, + status: 308, + }); + + const response = await fetchAsBot({ + url: new URL("/test", baseUrl), + method: "GET", + }).catch((e) => e as Error); + + expect.assert( + response instanceof Error === true, + "Expected an error to be thrown", + ); + expect(response.message).toBe(`Invalid redirect protocol for ${url}`); + }); + + test("Should handle absolute redirects", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectPath = "/another-test"; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: 301, + }); + + const redirectedUrl = new URL(redirectPath, baseUrl); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + const response = await fetchAsBot({ + url, + method: "GET", + }); + + expect(await response.body.text()).toBe(redirectionBody); + }); + + test("Should handle relative redirects", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test/", baseUrl); + const redirectPath = "another-test"; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: 301, + }); + + const redirectedUrl = new URL(redirectPath, url); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + const response = await fetchAsBot({ + url, + method: "GET", + }); + + expect(await response.body.text()).toBe(redirectionBody); + }); + + test("Should handle full URL redirects", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectUrl = new URL("/another-test", baseUrl); + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectUrl.toString(), + }, + status: 301, + }); + + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + const response = await fetchAsBot({ + url, + method: "GET", + }); + + expect(await response.body.text()).toBe(redirectionBody); + }); + + test("Should handle cross-domain redirects", async () => { + const baseUrl = "https://example.com"; + const redirectBaseUrl = "https://example.net"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: new URL("/robots.txt", redirectBaseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectUrl = new URL("/test", redirectBaseUrl); + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectUrl.toString(), + }, + status: 301, + }); + + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + const response = await fetchAsBot({ + url, + method: "GET", + }); + + expect(await response.body.text()).toBe(redirectionBody); + }); + + test("Should throw error when redirected location in same domain is disallowed by robots.txt", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow: /another-test\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectPath = "/another-test"; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: 301, + }); + + const redirectedUrl = new URL(redirectPath, url); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + const response = await fetchAsBot({ + url, + method: "GET", + }).catch((e) => e as Error); + + expect.assert( + response instanceof Error === true, + "Expected an error to be thrown", + ); + expect(response.message).toBe( + `playful-programming/1.0 is disallowed from ${url.hostname}!`, + ); + + // Consume the remaining redirect mock so afterEach has no pending interceptors + await request(redirectedUrl); + }); + + test("Should throw error when redirected location in different domain is disallowed by robots.txt", async () => { + const baseUrl = "https://example.com"; + const redirectBaseUrl = "https://example.net"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: new URL("/robots.txt", redirectBaseUrl), + body: "User-agent: *\nDisallow: /test\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectUrl = new URL("/test", redirectBaseUrl); + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectUrl.toString(), + }, + status: 301, + }); + + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + const response = await fetchAsBot({ + url, + method: "GET", + }).catch((e) => e as Error); + + expect.assert( + response instanceof Error === true, + "Expected an error to be thrown", + ); + expect(response.message).toBe( + `playful-programming/1.0 is disallowed from ${redirectUrl.hostname}!`, + ); + + // Consume the remaining redirect mock so afterEach has no pending interceptors + await request(redirectUrl); + }); }); -describe("fetchAsBotStream", () => { - test("Should throw error when a 400 status code is returned", async () => { - const robotsUrl = new URL("https://example.com/robots.txt"); - const url = new URL("https://example.com/test"); +describe("fetchAsBotStream", () => { + test("Should throw error when a 400 status code is returned", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); + + mockEndpoint({ + path: robotsUrl, + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: url, + body: "", + status: 400, + }); + + const botFetchStream = fetchAsBotStream({ + url, + method: "GET", + writable: createWriteStream(devNull), + }); + + await expect(botFetchStream).rejects.toThrow(); + }); + + test("Should return successful data for a URL with no robots.txt", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); + + mockEndpoint({ + path: robotsUrl, + status: 404, + body: "", + }); + mockEndpoint({ + path: url, + body: "Hello!", + }); + + let body = ""; + await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + }); + expect(body).toBe("Hello!"); + }); + + test("Should return successful data for a URL with a valid robots.txt", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); + + mockEndpoint({ + path: robotsUrl, + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: url, + body: "Hello!", + }); + + let body = ""; + await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + }); + + expect(body).toBe("Hello!"); + }); + + test("Should not return data for a URL disallowed by robots.txt", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + const url = new URL("https://example.com/test"); + + mockEndpoint({ + path: robotsUrl, + body: "User-agent: *\nDisallow: /test\n", + headers: { + "content-type": "text/plain", + }, + }); + + const error = await fetchAsBotStream({ + url, + method: "GET", + writable: createWriteStream(devNull), + }).catch((e) => e as Error); + + expect(error).toBeInstanceOf(RobotDeniedError); + expect(error?.message).toBe( + `playful-programming/1.0 is disallowed from ${url.hostname}!`, + ); + }); + + test("Should return data for a URL disallowed by robots.txt when skipRobotsCheck is true", async () => { + const robotsUrl = new URL("https://example.com/robots.txt"); + mockEndpoint({ + path: robotsUrl, + body: "User-agent: *\nDisallow: /test\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("https://example.com/test"); + const mockedBody = "Hello!"; + mockEndpoint({ + path: url, + body: mockedBody, + }); + + let body = ""; + await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + skipRobotsCheck: true, + }); + + expect(body).toBe(mockedBody); + + // Consume the remaining robots.txt mock so afterEach has no pending interceptors + await request(robotsUrl); + }); + + test("Should handle single redirect", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectPath = "/another-test"; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: 301, + }); + + const redirectedUrl = new URL(redirectPath, baseUrl); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + let body = ""; + await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + }); + + expect(body).toBe(redirectionBody); + }); + + test("Should handle chain of redirects", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const redirectionBody = "This is the redirection result."; + for (let i = 1; i <= 3; i++) { + if (i === 3) { + mockEndpoint({ + path: new URL(`/${i}`, baseUrl), + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + continue; + } + + mockEndpoint({ + path: new URL(`/${i}`, baseUrl), + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: new URL(`/${i + 1}`, baseUrl).toString(), + }, + status: 301, + }); + } + + let body = ""; + await fetchAsBotStream({ + url: new URL("/1", baseUrl), + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + }); + + expect(body).toBe(redirectionBody); + }); + + test("Should throw error when redirect limit is exceeded", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + for (let i = 1; i <= 12; i++) { + if (i === 12) { + mockEndpoint({ + path: new URL(`/${i}`, baseUrl), + body: "This is the redirection result.", + headers: { + "content-type": "text/plain", + }, + }); + continue; + } + + mockEndpoint({ + path: new URL(`/${i}`, baseUrl), + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: `/${i + 1}`, + }, + status: 302, + }); + } + + let body = ""; + const error = await fetchAsBotStream({ + url: new URL("/1", baseUrl), + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + }).catch((e) => e as Error); + + expect(body).toBe(""); + expect(error).toBeInstanceOf(Error); + + // Consume the remaining redirect mocks so afterEach has no pending interceptors + await request(new URL("/12", baseUrl)); + }); + + test("Should throw error when redirecting to invalid location", async () => { + const baseUrl = "https://example.com"; mockEndpoint({ - path: robotsUrl, + path: new URL("/robots.txt", baseUrl), body: "User-agent: *\nDisallow:\n", headers: { "content-type": "text/plain", }, }); + + const url = new URL("/test", baseUrl); + const invalidLocation = "http://[invalid-url"; mockEndpoint({ path: url, - body: "", - status: 400, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: invalidLocation, + }, + status: 303, }); - const botFetchStream = fetchAsBotStream({ - url, + const error = await fetchAsBotStream({ + url: new URL("/test", baseUrl), method: "GET", writable: createWriteStream(devNull), + }).catch((e) => e as Error); + + expect(error).toBeInstanceOf(Error); + expect(error?.message).toBe( + `The redirect location ${invalidLocation} couldn't be parsed as a URL for ${url}`, + ); + }); + + test("Should throw error when location header is missing on redirect", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, }); - await expect(botFetchStream).rejects.toThrow(); + const url = new URL("/test", baseUrl); + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + }, + status: 307, + }); + + const error = await fetchAsBotStream({ + url: new URL("/test", baseUrl), + method: "GET", + writable: createWriteStream(devNull), + }).catch((e) => e as Error); + + expect(error).toBeInstanceOf(Error); + expect(error?.message).toBe( + `The redirect location undefined couldn't be parsed as a URL for ${url}`, + ); }); - test("Should return successful data for a URL with no robots.txt", async () => { - const robotsUrl = new URL("https://example.com/robots.txt"); - const url = new URL("https://example.com/test"); + test("Should throw error when redirecting to unsupported protocol", async () => { + const baseUrl = "https://example.com"; mockEndpoint({ - path: robotsUrl, - status: 404, - body: "", + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, }); + + const url = new URL("/test", baseUrl); mockEndpoint({ path: url, - body: "Hello!", + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: "ftp://example.com/file", + }, + status: 308, + }); + + const error = await fetchAsBotStream({ + url: new URL("/test", baseUrl), + method: "GET", + writable: createWriteStream(devNull), + }).catch((e) => e as Error); + + expect(error).toBeInstanceOf(Error); + expect(error?.message).toBe(`Invalid redirect protocol for ${url}`); + }); + + test("Should handle absolute redirects", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectPath = "/another-test"; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: 301, + }); + + const redirectedUrl = new URL(redirectPath, baseUrl); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, }); let body = ""; @@ -145,23 +1076,41 @@ describe("fetchAsBotStream", () => { }, }), }); - expect(body).toBe("Hello!"); + + expect(body).toBe(redirectionBody); }); - test("Should return successful data for a URL with a valid robots.txt", async () => { - const robotsUrl = new URL("https://example.com/robots.txt"); - const url = new URL("https://example.com/test"); + test("Should handle relative redirects", async () => { + const baseUrl = "https://example.com"; mockEndpoint({ - path: robotsUrl, + path: new URL("/robots.txt", baseUrl), body: "User-agent: *\nDisallow:\n", headers: { "content-type": "text/plain", }, }); + + const url = new URL("/test/", baseUrl); + const redirectPath = "another-test"; mockEndpoint({ path: url, - body: "Hello!", + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: 301, + }); + + const redirectedUrl = new URL(redirectPath, url); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, }); let body = ""; @@ -176,30 +1125,225 @@ describe("fetchAsBotStream", () => { }), }); - expect(body).toBe("Hello!"); + expect(body).toBe(redirectionBody); }); - test("Should not return data for a URL disallowed by robots.txt", async () => { - const robotsUrl = new URL("https://example.com/robots.txt"); - const url = new URL("https://example.com/test"); + test("Should handle full URL redirects", async () => { + const baseUrl = "https://example.com"; mockEndpoint({ - path: robotsUrl, - body: "User-agent: *\nDisallow: /test\n", + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectUrl = new URL("/another-test", baseUrl); + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectUrl.toString(), + }, + status: 301, + }); + + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + let body = ""; + await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + }); + + expect(body).toBe(redirectionBody); + }); + + test("Should handle cross-domain redirects", async () => { + const baseUrl = "https://example.com"; + const redirectBaseUrl = "https://example.net"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: new URL("/robots.txt", redirectBaseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectUrl = new URL("/test", redirectBaseUrl); + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectUrl.toString(), + }, + status: 301, + }); + + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + let body = ""; + await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + }); + + expect(body).toBe(redirectionBody); + }); + + test("Should throw error when redirected location in same domain is disallowed by robots.txt", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow: /another-test\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectPath = "/another-test"; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: 301, + }); + + const redirectedUrl = new URL(redirectPath, url); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, headers: { "content-type": "text/plain", }, }); + let body = ""; const error = await fetchAsBotStream({ url, method: "GET", - writable: createWriteStream(devNull), + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), }).catch((e) => e as Error); - expect(error).toBeInstanceOf(RobotDeniedError); + expect(body).toBe(""); + expect(error).toBeInstanceOf(Error); expect(error?.message).toBe( `playful-programming/1.0 is disallowed from ${url.hostname}!`, ); + + // Consume the remaining redirect mock so afterEach has no pending interceptors + await request(redirectedUrl); + }); + + test("Should throw error when redirected location in different domain is disallowed by robots.txt", async () => { + const baseUrl = "https://example.com"; + const redirectBaseUrl = "https://example.net"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + mockEndpoint({ + path: new URL("/robots.txt", redirectBaseUrl), + body: "User-agent: *\nDisallow: /test\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectUrl = new URL("/test", redirectBaseUrl); + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectUrl.toString(), + }, + status: 301, + }); + + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + let body = ""; + const error = await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + }).catch((e) => e as Error); + + expect(body).toBe(""); + expect(error).toBeInstanceOf(Error); + expect(error?.message).toBe( + `playful-programming/1.0 is disallowed from ${redirectUrl.hostname}!`, + ); + + // Consume the remaining redirect mock so afterEach has no pending interceptors + await request(redirectUrl); }); }); From d644702251cedc7cc5e8198fcda4cdb1642c83e9 Mon Sep 17 00:00:00 2001 From: Furkan Emin Can <78358128+femincan@users.noreply.github.com> Date: Thu, 9 Apr 2026 14:15:41 +0300 Subject: [PATCH 24/24] test: add tests to ensure redirects are not followed when followRedirects is set to 0 --- apps/worker/src/utils/fetchAsBot.test.ts | 108 +++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/apps/worker/src/utils/fetchAsBot.test.ts b/apps/worker/src/utils/fetchAsBot.test.ts index 3c646c9..b6303d3 100644 --- a/apps/worker/src/utils/fetchAsBot.test.ts +++ b/apps/worker/src/utils/fetchAsBot.test.ts @@ -122,6 +122,56 @@ describe("fetchAsBot", () => { await request(robotsUrl); }); + test("Should not follow redirects when followRedirects is 0", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectPath = "/another-test"; + const redirectStatus = 301; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: redirectStatus, + }); + + const redirectedUrl = new URL(redirectPath, baseUrl); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + const response = await fetchAsBot({ + url, + method: "GET", + followRedirects: 0, + }).catch((e) => e as Error); + + expect.assert( + response instanceof Error === true, + "Expected an error to be thrown", + ); + expect(response.message).toBe(`Request ${url} returned ${redirectStatus}`); + + // Consume the remaining redirect mock so afterEach has no pending interceptors + await request(redirectedUrl); + }); + test("Should handle single redirect", async () => { const baseUrl = "https://example.com"; @@ -781,6 +831,64 @@ describe("fetchAsBotStream", () => { await request(robotsUrl); }); + test("Should not follow redirects when followRedirects is 0", async () => { + const baseUrl = "https://example.com"; + + mockEndpoint({ + path: new URL("/robots.txt", baseUrl), + body: "User-agent: *\nDisallow:\n", + headers: { + "content-type": "text/plain", + }, + }); + + const url = new URL("/test", baseUrl); + const redirectPath = "/another-test"; + const redirectStatus = 301; + mockEndpoint({ + path: url, + body: "Redirecting..", + headers: { + "content-type": "text/plain", + location: redirectPath, + }, + status: redirectStatus, + }); + + const redirectedUrl = new URL(redirectPath, baseUrl); + const redirectionBody = "This is the redirection result."; + mockEndpoint({ + path: redirectedUrl, + body: redirectionBody, + headers: { + "content-type": "text/plain", + }, + }); + + let body = ""; + const error = await fetchAsBotStream({ + url, + method: "GET", + writable: new Writable({ + write(chunk, _encoding, next) { + body += chunk; + next(); + }, + }), + followRedirects: 0, + }).catch((e) => e as Error); + + expect.assert( + error instanceof Error === true, + "Expected an error to be thrown", + ); + expect(error.message).toBe(`Request ${url} returned ${redirectStatus}`); + expect(body).toBe(""); + + // Consume the remaining redirect mock so afterEach has no pending interceptors + await request(redirectedUrl); + }); + test("Should handle single redirect", async () => { const baseUrl = "https://example.com";