From 971ac6be8d3d215e4fd8635231d2404b740156aa Mon Sep 17 00:00:00 2001
From: YeonGyu-Kim
Date: Thu, 25 Jun 2026 11:39:50 +0900
Subject: [PATCH] fix: preserve literal HTML entities

---
 src/webfetch/content.ts | 45 +++++++++++++++++------------------------
 test/webfetch.test.ts   | 37 +++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 26 deletions(-)

diff --git a/src/webfetch/content.ts b/src/webfetch/content.ts
index 4838930..d16fb86 100644
--- a/src/webfetch/content.ts
+++ b/src/webfetch/content.ts
@@ -130,16 +130,13 @@ function htmlFragmentToPlainText(html: string): string {
 
 function htmlFragmentToPlainTextFallback(html: string): string {
   return decodeHtmlEntities(
-    html
-      .replace(TAGS_TO_REMOVE, "")
-      .replace(VOID_TAGS_TO_REMOVE, "")
-      .replace(BLOCK_BREAK_TAGS, "\n")
-      .replace(TAGS, "")
-      .replace(WHITESPACE, " ")
-      .replace(/[ \t]+\n/g, "\n")
-      .replace(/\n[ \t]+/g, "\n")
-      .replace(NEWLINE_RUN, "\n\n")
-      .trim(),
+    normalizePlainText(
+      html
+        .replace(TAGS_TO_REMOVE, "")
+        .replace(VOID_TAGS_TO_REMOVE, "")
+        .replace(BLOCK_BREAK_TAGS, "\n")
+        .replace(TAGS, ""),
+    ),
   );
 }
 
@@ -196,25 +193,21 @@ function extractReadableArticle(html: string, url: string): ReadableArticle | un
 }
 
 function normalizePlainText(text: string): string {
-  return decodeHtmlEntities(
-    text
-      .replace(WHITESPACE, " ")
-      .replace(/[ \t]+\n/g, "\n")
-      .replace(/\n[ \t]+/g, "\n")
-      .replace(NEWLINE_RUN, "\n\n")
-      .trim(),
-  );
+  return text
+    .replace(WHITESPACE, " ")
+    .replace(/[ \t]+\n/g, "\n")
+    .replace(/\n[ \t]+/g, "\n")
+    .replace(NEWLINE_RUN, "\n\n")
+    .trim();
 }
 
 function normalizeMarkdown(markdown: string): string {
-  return decodeHtmlEntities(
-    markdown
-      .replace(/\r\n?/g, "\n")
-      .replace(/[ \t]+\n/g, "\n")
-      .replace(/\n[ \t]+/g, "\n")
-      .replace(NEWLINE_RUN, "\n\n")
-      .trim(),
-  );
+  return markdown
+    .replace(/\r\n?/g, "\n")
+    .replace(/[ \t]+\n/g, "\n")
+    .replace(/\n[ \t]+/g, "\n")
+    .replace(NEWLINE_RUN, "\n\n")
+    .trim();
 }
 
 export function decodeHtmlEntities(text: string): string {
diff --git a/test/webfetch.test.ts b/test/webfetch.test.ts
index 3071291..5a58019 100644
--- a/test/webfetch.test.ts
+++ b/test/webfetch.test.ts
@@ -110,6 +110,19 @@ function newlineFixtureHtml(): string {
   `;
 }
 
+function literalEntityFixtureHtml(): string {
+  return `
+
+
+
+

Literal Entity Fixture

+

Rendered tag example: &amp;lt;custom-element&amp;gt;

+

Escaped ampersand example: AT&amp;amp;T docs

+
+
+  `;
+}
+
 async function waitUntil(assertion: () => void): Promise<void> {
   const deadline = Date.now() + 500;
   let lastError: unknown;
@@ -347,6 +360,30 @@ describe("webfetch", () => {
     expect(text).not.toContain("첫 줄둘째 줄");
   });
 
+  it("#given literal HTML entity examples #when fetching markdown and text #then preserves one decoded layer only", async () => {
+    // given
+    const server = await createFixtureServer((_request, response) => {
+      response.writeHead(200, { "content-type": "text/html; charset=utf-8" });
+      response.end(literalEntityFixtureHtml());
+    });
+
+    // when
+    const markdown = textContent(
+      await executeWebfetch({ url: `${server.baseUrl}/literal-entity`, format: "markdown" }),
+    );
+    const text = textContent(await executeWebfetch({ url: `${server.baseUrl}/literal-entity`, format: "text" }));
+
+    // then
+    expect(markdown).toContain("&lt;custom-element&gt;");
+    expect(markdown).toContain("AT&amp;T docs");
+    expect(markdown).not.toContain("<custom-element>");
+    expect(markdown).not.toContain("AT&T docs");
+    expect(text).toContain("&lt;custom-element&gt;");
+    expect(text).toContain("AT&amp;T docs");
+    expect(text).not.toContain("<custom-element>");
+    expect(text).not.toContain("AT&T docs");
+  });
+
   it("#given html page #when fetching html #then returns raw html", async () => {
     // given
     const html = "

Raw

HTML

";