From a5a65a7f59a24dfcbaed8f4f517bf83ba1fc9bab Mon Sep 17 00:00:00 2001 From: YeonGyu-Kim Date: Thu, 25 Jun 2026 11:56:09 +0900 Subject: [PATCH] fix: skip readability after explicit article match --- src/webfetch/content.ts | 2 +- test/webfetch-explicit-article.test.ts | 45 ++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 test/webfetch-explicit-article.test.ts diff --git a/src/webfetch/content.ts b/src/webfetch/content.ts index d16fb86..d102379 100644 --- a/src/webfetch/content.ts +++ b/src/webfetch/content.ts @@ -169,12 +169,12 @@ function extractReadableArticle(html: string, url: string): ReadableArticle | un }; break; } + if (explicitArticle) return explicitArticle; const article = new Readability(dom.window.document, { charThreshold: 80, keepClasses: false, }).parse(); - if (explicitArticle) return explicitArticle; if (!article?.content || !article.textContent) return undefined; return { title: normalizePlainText( diff --git a/test/webfetch-explicit-article.test.ts b/test/webfetch-explicit-article.test.ts new file mode 100644 index 0000000..dc84bb0 --- /dev/null +++ b/test/webfetch-explicit-article.test.ts @@ -0,0 +1,45 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; + +const readability = vi.hoisted(() => ({ + parse: vi.fn(() => { + throw new Error("Readability should not run for explicit article matches"); + }), +})); + +vi.mock("@mozilla/readability", () => ({ + Readability: class { + parse(): unknown { + return readability.parse(); + } + }, +})); + +import { htmlToMarkdown } from "../src/webfetch/content.js"; + +function explicitArticleHtml(): string { + return ` + + +

Explicit Article

+
+

Explicit article body has enough words to pass the direct article selector threshold.

+
+ + `; +} + +describe("webfetch explicit article extraction", () => { + beforeEach(() => { + readability.parse.mockClear(); + }); + + it("#given an explicit article container #when converting markdown #then skips Readability fallback parsing", () => { + // given / when + const markdown = htmlToMarkdown(explicitArticleHtml(), "https://example.test/post"); + + // then + expect(markdown).toContain("# Explicit Article"); + expect(markdown).toContain("Explicit article body"); + expect(readability.parse).not.toHaveBeenCalled(); + }); +});