diff --git a/src/webfetch/content.ts b/src/webfetch/content.ts index fba9a0b..4838930 100644 --- a/src/webfetch/content.ts +++ b/src/webfetch/content.ts @@ -12,9 +12,48 @@ const TAGS_TO_REMOVE = /<(script|style|noscript|iframe|object|embed|meta|link)\b const VOID_TAGS_TO_REMOVE = /<(script|style|noscript|iframe|object|embed|meta|link)\b[^>]*\/?>/gi; const BLOCK_BREAK_TAGS = /<\/?(address|article|aside|blockquote|br|dd|div|dl|dt|figcaption|figure|footer|h[1-6]|header|hr|li|main|nav|ol|p|pre|section|table|tbody|td|tfoot|th|thead|tr|ul)\b[^>]*>/gi; +const BLOCK_BREAK_SELECTOR = + "address, article, aside, blockquote, dd, div, dl, dt, figcaption, figure, footer, h1, h2, h3, h4, h5, h6, header, hr, li, main, nav, ol, p, pre, section, table, tbody, tfoot, thead, tr, ul"; +const CELL_BREAK_SELECTOR = "td, th"; const TAGS = /<[^>]+>/g; -const WHITESPACE = /[\t\f\v ]+/g; +const WHITESPACE = /[\t\f\v \u00a0]+/g; const NEWLINE_RUN = /\n{3,}/g; +const MIN_EXPLICIT_ARTICLE_TEXT_LENGTH = 30; +const EXPLICIT_ARTICLE_SELECTORS = [ + ".article_view", + ".tt_article_useless_p_margin", + ".entry-content", + ".contents_style", + ".post-content", + ".article-content", + ".content-article", + "#content .contents_style", +]; +const ARTICLE_NOISE_SELECTOR = [ + "script", + "style", + "noscript", + "iframe", + "object", + "embed", + "meta", + "link", + "nav", + "aside", + "footer", + ".another_category", + ".area_related", + ".related", + ".revenue_unit_wrap", + ".adsbygoogle", + ".container_postbtn", + ".postbtn_like", + ".comments", + ".comment", + ".tagTrail", + ".sidebar", +].join(", "); +const TITLE_SELECTOR = "h1, .tit_post, .entry-title, .post-title, .article-title"; const ENTITIES: Readonly> = { amp: "&", @@ -36,9 +75,9 @@ turndownService.remove(["script", "style", "noscript", "iframe", "object", "embe export function htmlToMarkdown(html: string, url: string): string { const article = extractReadableArticle(html, url); - if (!article) return turndownService.turndown(html).trim(); + if (!article) return normalizeMarkdown(turndownService.turndown(html)); - const markdown = turndownService.turndown(article.content).trim(); + const markdown = normalizeMarkdown(turndownService.turndown(article.content)); if (!article.title || article.hasHeading || markdown.startsWith(`# ${article.title}`)) return markdown; return `# ${article.title}\n\n${markdown}`.trim(); } @@ -56,6 +95,40 @@ export function htmlToText(html: string, url: string): string { } function htmlFragmentToPlainText(html: string): string { + try { + const dom = new JSDOM(`${html}`, { + contentType: "text/html", + virtualConsole: new VirtualConsole(), + }); + try { + const document = dom.window.document; + for (const element of document.querySelectorAll( + "script, style, noscript, iframe, object, embed, meta, link", + )) { + element.remove(); + } + for (const element of document.querySelectorAll("br")) { + element.replaceWith(document.createTextNode("\n")); + } + for (const element of document.querySelectorAll(CELL_BREAK_SELECTOR)) { + element.after(document.createTextNode("\n")); + } + for (const element of document.querySelectorAll(BLOCK_BREAK_SELECTOR)) { + element.before(document.createTextNode("\n")); + element.after(document.createTextNode("\n")); + } + return normalizePlainText(document.body.textContent ?? ""); + } finally { + dom.window.close(); + } + } catch (error) { + if (!(error instanceof Error)) throw error; + } + + return htmlFragmentToPlainTextFallback(html); +} + +function htmlFragmentToPlainTextFallback(html: string): string { return decodeHtmlEntities( html .replace(TAGS_TO_REMOVE, "") @@ -78,13 +151,38 @@ function extractReadableArticle(html: string, url: string): ReadableArticle | un virtualConsole: new VirtualConsole(), }); try { + let explicitArticle: ReadableArticle | undefined; + for (const selector of EXPLICIT_ARTICLE_SELECTORS) { + const candidate = dom.window.document.querySelector(selector); + if (!candidate) continue; + const clonedNode = candidate.cloneNode(true); + if (!(clonedNode instanceof dom.window.Element)) continue; + for (const noisyElement of clonedNode.querySelectorAll(ARTICLE_NOISE_SELECTOR)) { + noisyElement.remove(); + } + const text = normalizePlainText(clonedNode.textContent ?? ""); + if (text.length < MIN_EXPLICIT_ARTICLE_TEXT_LENGTH) continue; + const title = normalizePlainText( + dom.window.document.querySelector(TITLE_SELECTOR)?.textContent ?? dom.window.document.title, + ); + explicitArticle = { + title, + content: clonedNode.innerHTML, + hasHeading: / { if (entity.startsWith("#x")) { diff --git a/src/webfetch/fetcher.ts b/src/webfetch/fetcher.ts index 54c1d01..8f878b5 100644 --- a/src/webfetch/fetcher.ts +++ b/src/webfetch/fetcher.ts @@ -1,3 +1,6 @@ +import type { IncomingHttpHeaders } from "node:http"; +import { request } from "undici"; + import { InvalidWebfetchUrlError, WebfetchAbortError, @@ -8,9 +11,12 @@ import { export const MAX_RESPONSE_SIZE_BYTES = 5 * 1024 * 1024; export const DEFAULT_TIMEOUT_SECONDS = 30; export const MAX_TIMEOUT_SECONDS = 120; +const MAX_REDIRECTS = 20; const BROWSER_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36"; +const CHROME_MAJOR_VERSION = "143"; +const REDIRECT_STATUSES = new Set([301, 302, 303, 307, 308]); export type WebfetchFormat = "markdown" | "text" | "html"; @@ -31,33 +37,40 @@ export interface FetchResult { truncated: boolean; } +interface HttpResponse { + readonly url: string; + readonly status: number; + readonly statusText: string; + readonly headers: IncomingHttpHeaders; + readonly body: ResponseBodyStream; +} + +interface ResponseBodyStream extends AsyncIterable { + destroy(error?: Error): void; + dump(options?: { limit: number; signal?: AbortSignal }): Promise; +} + export async function fetchUrl(options: FetchOptions): Promise { validateUrl(options.url); const timeoutSeconds = clampTimeout(options.timeoutSeconds); + const timeoutMs = timeoutSeconds * 1000; const controller = new AbortController(); const timeout = setTimeout( () => controller.abort(new WebfetchTimeoutError(`Request timed out after ${timeoutSeconds}s`)), - timeoutSeconds * 1000, + timeoutMs, ); const removeAbortForwarder = forwardAbort(options.signal, controller); try { - const response = await fetch(options.url, { - headers: buildHeaders(options.format, BROWSER_USER_AGENT), + const response = await requestUrl({ + url: options.url, + format: options.format, signal: controller.signal, + timeoutMs, }); - if (response.status === 403 && response.headers.get("cf-mitigated") === "challenge") { - await cancelBody(response); - const retry = await fetch(options.url, { - headers: buildHeaders(options.format, "pi-webfetch"), - signal: controller.signal, - }); - return await readFetchResponse(options.url, retry, controller.signal); - } - - return await readFetchResponse(options.url, response, controller.signal); + return await readHttpResponse(response, controller.signal); } finally { clearTimeout(timeout); removeAbortForwarder(); @@ -93,68 +106,143 @@ export function buildAcceptHeader(format: WebfetchFormat): string { } } -function buildHeaders(format: WebfetchFormat, userAgent: string): HeadersInit { +function buildHeaders(format: WebfetchFormat): Record { return { Accept: buildAcceptHeader(format), "Accept-Language": "en-US,en;q=0.9", - "User-Agent": userAgent, + "Sec-CH-UA": `"Google Chrome";v="${CHROME_MAJOR_VERSION}", "Chromium";v="${CHROME_MAJOR_VERSION}", "Not A(Brand";v="24"`, + "Sec-CH-UA-Mobile": "?0", + "Sec-CH-UA-Platform": '"Windows"', + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "none", + "Sec-Fetch-User": "?1", + "Upgrade-Insecure-Requests": "1", + "User-Agent": BROWSER_USER_AGENT, }; } -async function readFetchResponse(url: string, response: Response, signal: AbortSignal): Promise { +interface RequestUrlOptions { + readonly url: string; + readonly format: WebfetchFormat; + readonly signal: AbortSignal; + readonly timeoutMs: number; +} + +async function requestUrl(options: RequestUrlOptions): Promise { + const headers = buildHeaders(options.format); + let currentUrl = options.url; + + for (let redirectCount = 0; redirectCount <= MAX_REDIRECTS; redirectCount += 1) { + const response = await request(currentUrl, { + method: "GET", + headers, + signal: options.signal, + headersTimeout: options.timeoutMs, + bodyTimeout: options.timeoutMs, + }); + + if (!REDIRECT_STATUSES.has(response.statusCode)) { + return { + url: currentUrl, + status: response.statusCode, + statusText: response.statusText, + headers: response.headers, + body: response.body, + }; + } + + const location = getHeader(response.headers, "location"); + if (!location) { + return { + url: currentUrl, + status: response.statusCode, + statusText: response.statusText, + headers: response.headers, + body: response.body, + }; + } + + if (redirectCount === MAX_REDIRECTS) { + return { + url: currentUrl, + status: response.statusCode, + statusText: response.statusText, + headers: response.headers, + body: response.body, + }; + } + + await discardBody(response.body); + currentUrl = new URL(location, currentUrl).toString(); + } + + throw new WebfetchAbortError("Redirect resolution aborted"); +} + +async function readHttpResponse(response: HttpResponse, signal: AbortSignal): Promise { await rejectOversizedContentLength(response); const body = await readResponseBody(response, signal); return { - url: response.url || url, + url: response.url, status: response.status, statusText: response.statusText, - contentType: response.headers.get("content-type") ?? "", + contentType: getHeader(response.headers, "content-type"), bytes: body.length, body, truncated: body.length === MAX_RESPONSE_SIZE_BYTES, }; } -async function rejectOversizedContentLength(response: Response): Promise { - const contentLength = response.headers.get("content-length"); +async function rejectOversizedContentLength(response: HttpResponse): Promise { + const contentLength = getHeader(response.headers, "content-length"); if (contentLength && Number.parseInt(contentLength, 10) > MAX_RESPONSE_SIZE_BYTES) { - await cancelBody(response); + await discardBody(response.body); throw new WebfetchResponseTooLargeError("Response too large (exceeds 5MB limit)"); } } -async function cancelBody(response: Response): Promise { +function getHeader(headers: IncomingHttpHeaders, name: string): string { + const value = headers[name.toLowerCase()]; + if (Array.isArray(value)) return value.join(", "); + return value ?? ""; +} + +async function discardBody(body: ResponseBodyStream): Promise { try { - await response.body?.cancel(); - } catch { - // Preserve the caller's original failure. + await body.dump({ limit: 1024 }); + } catch (error) { + if (error instanceof Error) { + body.destroy(error); + return; + } + throw error; } } -async function readResponseBody(response: Response, signal: AbortSignal): Promise { - if (!response.body) return new Uint8Array(); - - const reader = response.body.getReader(); +async function readResponseBody(response: HttpResponse, signal: AbortSignal): Promise { const chunks: Uint8Array[] = []; let total = 0; try { - while (true) { + for await (const chunk of response.body) { if (signal.aborted) { - await cancelReader(reader); + response.body.destroy(); throw new WebfetchAbortError("Request aborted"); } - const read = await reader.read(); - if (read.done) break; - chunks.push(read.value); - total += read.value.length; + const bytes = toUint8Array(chunk); + chunks.push(bytes); + total += bytes.length; if (total > MAX_RESPONSE_SIZE_BYTES) { - await cancelReader(reader); + response.body.destroy(); throw new WebfetchResponseTooLargeError("Response too large (exceeds 5MB limit)"); } } - } finally { - reader.releaseLock(); + } catch (error) { + if (signal.aborted) { + throw new WebfetchAbortError("Request aborted"); + } + throw error; } const body = new Uint8Array(total); @@ -166,12 +254,10 @@ async function readResponseBody(response: Response, signal: AbortSignal): Promis return body; } -async function cancelReader(reader: ReadableStreamDefaultReader): Promise { - try { - await reader.cancel(); - } catch { - // Preserve the caller's original failure. - } +function toUint8Array(chunk: unknown): Uint8Array { + if (chunk instanceof Uint8Array) return chunk; + if (typeof chunk === "string") return new TextEncoder().encode(chunk); + throw new Error("Unexpected response body chunk"); } function forwardAbort(signal: AbortSignal | undefined, controller: AbortController): () => void { diff --git a/test/webfetch.test.ts b/test/webfetch.test.ts index f0be352..3071291 100644 --- a/test/webfetch.test.ts +++ b/test/webfetch.test.ts @@ -6,6 +6,7 @@ import { MAX_RESPONSE_SIZE_BYTES } from "../src/webfetch/fetcher.js"; import { webfetch } from "../src/webfetch/tool.js"; type RouteHandler = (request: IncomingMessage, response: ServerResponse) => void; +type CapturedHeaders = IncomingMessage["headers"]; const servers: Server[] = []; @@ -34,6 +35,12 @@ function textContent(result: Awaited>): strin return first.text; } +function headerValue(headers: CapturedHeaders, name: string): string { + const value = headers[name.toLowerCase()]; + if (Array.isArray(value)) return value.join(", "); + return value ?? ""; +} + afterEach(async () => { await Promise.all(servers.splice(0).map(closeServer)); }); @@ -42,6 +49,67 @@ function closeServer(server: Server): Promise { return new Promise((resolve, reject) => server.close((error) => (error ? reject(error) : resolve()))); } +function tistoryFixtureHtml(): string { + return ` + + + 관리자 메뉴가 제목을 이기면 안 됨 + + + +
+ 관리자 + 분류 전체보기 +
+ +
+

티스토리 본문을 읽어야 합니다

+
+
+

첫 번째 본문 문장은 짧은 티스토리 글에서도 반드시 남아야 합니다.

+

두 번째 본문 문장은 카테고리나 관련 글보다 우선되어야 합니다.

+
+
본문 이미지 설명도 보존됩니다.
+
+
+
+

다른 글 보기

+
    +
  • 관련 글 제목 하나
  • +
  • 관련 글 제목 둘
  • +
+
+
+
+
구독하기 푸터와 방명록 링크
+ + + `; +} + +function newlineFixtureHtml(): string { + return ` + + +
+

줄바꿈 보존

+

첫 줄
둘째 줄

+

새 문단 강조

+
    +
  • 첫 항목
  • +
  • 둘째 항목
  • +
+ + +
왼쪽 칸오른쪽 칸
+
+ + `; +} + async function waitUntil(assertion: () => void): Promise { const deadline = Date.now() + 500; let lastError: unknown; @@ -152,6 +220,53 @@ describe("webfetch", () => { expect(text).not.toContain("window.tracker"); }); + it("#given a web page #when fetching markdown #then sends browser navigation headers", async () => { + // given + let capturedHeaders: CapturedHeaders | undefined; + const server = await createFixtureServer((request, response) => { + capturedHeaders = request.headers; + response.writeHead(200, { "content-type": "text/html; charset=utf-8" }); + response.end(tistoryFixtureHtml()); + }); + + // when + await executeWebfetch({ url: `${server.baseUrl}/headers`, format: "markdown" }); + + // then + expect(capturedHeaders).toBeDefined(); + if (!capturedHeaders) throw new Error("Expected captured request headers"); + expect(headerValue(capturedHeaders, "user-agent")).toContain("Mozilla/5.0"); + expect(headerValue(capturedHeaders, "accept")).toContain("text/markdown"); + expect(headerValue(capturedHeaders, "accept-language")).toBe("en-US,en;q=0.9"); + expect(headerValue(capturedHeaders, "sec-fetch-mode")).toBe("navigate"); + expect(headerValue(capturedHeaders, "sec-fetch-dest")).toBe("document"); + expect(headerValue(capturedHeaders, "sec-ch-ua-platform")).toBe('"Windows"'); + }); + + it("#given Tistory article wrappers #when fetching markdown #then prefers the article body over category chrome", async () => { + // given + const server = await createFixtureServer((_request, response) => { + response.writeHead(200, { "content-type": "text/html; charset=utf-8" }); + response.end(tistoryFixtureHtml()); + }); + + // when + const result = await executeWebfetch({ url: `${server.baseUrl}/tistory`, format: "markdown" }); + const text = textContent(result); + + // then + expect(text).toContain("# 티스토리 본문을 읽어야 합니다"); + expect(text).toContain("첫 번째 본문 문장은"); + expect(text).toContain("두 번째 본문 문장은"); + expect(text).toContain("본문 이미지 설명도 보존됩니다"); + expect(text).not.toContain("관리자 메뉴가 제목을 이기면 안 됨"); + expect(text).not.toContain("분류 전체보기"); + expect(text).not.toContain("최근 글"); + expect(text).not.toContain("관련 글 제목"); + expect(text).not.toContain("구독하기 푸터"); + expect(text).not.toContain("tistoryTracker"); + }); + it("#given html page #when fetching text #then returns readable text without tags", async () => { // given const server = await createFixtureServer((_request, response) => { @@ -213,6 +328,25 @@ describe("webfetch", () => { expect(result.details?.format).toBe("text"); }); + it("#given Tistory text with inline spans and blocks #when fetching text #then preserves readable line breaks", async () => { + // given + const server = await createFixtureServer((_request, response) => { + response.writeHead(200, { "content-type": "text/html; charset=utf-8" }); + response.end(newlineFixtureHtml()); + }); + + // when + const result = await executeWebfetch({ url: `${server.baseUrl}/newline`, format: "text" }); + const text = textContent(result); + + // then + expect(text).toContain("줄바꿈 보존\n\n첫 줄\n둘째 줄\n\n새 문단 강조"); + expect(text).toContain("첫 항목\n\n둘째 항목"); + expect(text).toContain("왼쪽 칸\n오른쪽 칸"); + expect(text).not.toContain("\n\n\n"); + expect(text).not.toContain("첫 줄둘째 줄"); + }); + it("#given html page #when fetching html #then returns raw html", async () => { // given const html = "

Raw

HTML

"; @@ -291,31 +425,53 @@ describe("webfetch", () => { expect(result.details?.truncated).toBe(true); }); - it("#given Cloudflare challenge #when retrying #then closes the challenged response", async () => { + it("#given Cloudflare challenge response #when fetching #then does not retry with a bot identity", async () => { // given - let challengeClosed = false; - let requests = 0; - const server = await createFixtureServer((_request, response) => { - requests += 1; - if (requests === 1) { - response.writeHead(403, { "cf-mitigated": "challenge", "content-type": "text/html" }); - response.write("

challenge

"); - response.on("close", () => { - challengeClosed = true; + const attempts: CapturedHeaders[] = []; + const server = await createFixtureServer((request, response) => { + attempts.push(request.headers); + if (attempts.length === 1) { + response.writeHead(403, { + "cf-mitigated": "challenge", + "content-type": "text/html; charset=utf-8", }); + response.end("challenge"); return; } - response.writeHead(200, { "content-type": "text/plain" }); + response.writeHead(200, { "content-type": "text/plain; charset=utf-8" }); response.end("retried"); }); // when - const result = await executeWebfetch({ url: `${server.baseUrl}/challenge`, format: "text" }); + await executeWebfetch({ url: `${server.baseUrl}/challenge`, format: "text" }); + + // then + expect(attempts).toHaveLength(1); + const challengeHeaders = attempts[0]; + if (!challengeHeaders) throw new Error("Expected challenge request headers"); + expect(headerValue(challengeHeaders, "user-agent")).toContain("Mozilla/5.0"); + expect(headerValue(challengeHeaders, "user-agent")).not.toContain("pi-webfetch"); + expect(headerValue(challengeHeaders, "sec-fetch-mode")).toBe("navigate"); + expect(headerValue(challengeHeaders, "sec-fetch-dest")).toBe("document"); + expect(headerValue(challengeHeaders, "sec-ch-ua-platform")).toBe('"Windows"'); + }); + + it("#given too many redirects #when fetching #then returns the final redirect response body", async () => { + // given + const server = await createFixtureServer((_request, response) => { + response.writeHead(302, { + location: "/loop", + "content-type": "text/plain; charset=utf-8", + }); + response.end("redirect limit reached"); + }); + + // when + const result = await executeWebfetch({ url: `${server.baseUrl}/loop`, format: "text" }); + const text = textContent(result); // then - expect(textContent(result)).toBe("retried"); - expect(requests).toBe(2); - expect(challengeClosed).toBe(true); + expect(text).toContain("redirect limit reached"); }); });