Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 11 additions & 9 deletions src/webfetch/content.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ const EXPLICIT_ARTICLE_SELECTORS = [
".content-article",
"#content .contents_style",
];
// Title selectors in priority order: selectPreferredTitle tries them one by one
// and keeps the first that yields non-empty text, so the post-specific classes
// come before the generic "h1".
const TITLE_SELECTORS = [
  ".tit_post",
  ".entry-title",
  ".post-title",
  ".article-title",
  "h1",
];
const ARTICLE_NOISE_SELECTOR = [
"script",
"style",
Expand All @@ -53,8 +54,6 @@ const ARTICLE_NOISE_SELECTOR = [
".tagTrail",
".sidebar",
].join(", ");
const TITLE_SELECTOR = "h1, .tit_post, .entry-title, .post-title, .article-title";

const ENTITIES: Readonly<Record<string, string>> = {
amp: "&",
apos: "'",
Expand Down Expand Up @@ -159,11 +158,8 @@ function extractReadableArticle(html: string, url: string): ReadableArticle | un
}
const text = normalizePlainText(clonedNode.textContent ?? "");
if (text.length < MIN_EXPLICIT_ARTICLE_TEXT_LENGTH) continue;
const title = normalizePlainText(
dom.window.document.querySelector(TITLE_SELECTOR)?.textContent ?? dom.window.document.title,
);
explicitArticle = {
title,
title: selectPreferredTitle(dom.window.document, dom.window.document.title),
content: clonedNode.innerHTML,
hasHeading: /<h[1-6]\b/i.test(clonedNode.innerHTML),
};
Expand All @@ -177,9 +173,7 @@ function extractReadableArticle(html: string, url: string): ReadableArticle | un
}).parse();
if (!article?.content || !article.textContent) return undefined;
return {
title: normalizePlainText(
dom.window.document.querySelector(TITLE_SELECTOR)?.textContent ?? article.title ?? "",
),
title: selectPreferredTitle(dom.window.document, article.title ?? ""),
content: article.content,
hasHeading: /<h[1-6]\b/i.test(article.content),
};
Expand All @@ -192,6 +186,14 @@ function extractReadableArticle(html: string, url: string): ReadableArticle | un
}
}

/**
 * Picks the title for a document by trying each entry of TITLE_SELECTORS in order.
 *
 * @param document - DOM document to search.
 * @param fallback - Text used when no selector yields a non-empty title.
 * @returns The normalized text of the first selector match that is non-empty
 *   after normalization, otherwise the normalized fallback.
 */
function selectPreferredTitle(document: Document, fallback: string): string {
  for (const candidateSelector of TITLE_SELECTORS) {
    const matchedElement = document.querySelector(candidateSelector);
    // A missing element is treated the same as an element with empty text.
    const candidateTitle = normalizePlainText(matchedElement?.textContent ?? "");
    if (candidateTitle !== "") {
      return candidateTitle;
    }
  }
  return normalizePlainText(fallback);
}

function normalizePlainText(text: string): string {
return text
.replace(WHITESPACE, " ")
Expand Down
69 changes: 69 additions & 0 deletions test/webfetch.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,37 @@ function tistoryFixtureHtml(): string {
</html>`;
}

/**
 * Returns an HTML fixture page that offers several competing title candidates.
 *
 * The markup contains a `<title>`, a header `<h1>` that appears before the post
 * in document order, a `.sidebar` section, and inside `#content` an
 * `<h1 class="tit_post">` followed by two body paragraphs. The tests serving
 * this fixture expect the `.tit_post` heading to be reported as the title and
 * the header/sidebar text to be absent from the output.
 */
function titlePriorityFixtureHtml(): string {
return `<!doctype html>
<html>
<head>
<title>관리자 메뉴가 제목을 이기면 안 됨</title>
<meta name="description" content="티스토리 블로그 홍보 문구">
</head>
<body class="tt-body-page">
<header>
<h1>블로그 이름</h1>
<a href="/manage">관리자</a>
<a href="/category">분류 전체보기</a>
</header>
<section class="sidebar">
<h2>최근 글</h2>
<p>관련 없는 사이드바 설명이 길게 들어가서 리더가 이 영역을 본문으로 착각하면 안 됩니다.</p>
</section>
<div id="content">
<h1 class="tit_post">티스토리 본문을 읽어야 합니다</h1>
<div class="entry-content contents_style">
<div class="article_view tt_article_useless_p_margin">
<p data-ke-size="size16">첫 번째 본문 문장은 짧은 티스토리 글에서도 반드시 남아야 합니다.</p>
<p data-ke-size="size16">두 번째 본문 문장은 카테고리나 관련 글보다 우선되어야 합니다.</p>
</div>
</div>
</div>
<footer>구독하기 푸터와 방명록 링크</footer>
</body>
</html>`;
}

function newlineFixtureHtml(): string {
return `<!doctype html>
<html>
Expand Down Expand Up @@ -280,6 +311,44 @@ describe("webfetch", () => {
expect(text).not.toContain("tistoryTracker");
});

it("#given Tistory title chrome #when fetching markdown #then prefers the article title over site chrome", async () => {
  // given: a fixture server that answers every request with the title-priority page
  const fixtureServer = await createFixtureServer((_request, response) => {
    response.writeHead(200, { "content-type": "text/html; charset=utf-8" });
    response.end(titlePriorityFixtureHtml());
  });

  // when: the page is fetched as markdown
  const markdown = textContent(
    await executeWebfetch({ url: `${fixtureServer.baseUrl}/tistory-title`, format: "markdown" }),
  );

  // then: the post heading and both body sentences are present...
  for (const expected of ["# 티스토리 본문을 읽어야 합니다", "첫 번째 본문 문장은", "두 번째 본문 문장은"]) {
    expect(markdown).toContain(expected);
  }
  // ...and the header and sidebar text is absent
  for (const unwanted of ["블로그 이름", "관련 없는 사이드바 설명"]) {
    expect(markdown).not.toContain(unwanted);
  }
});

it("#given Tistory title chrome #when fetching text #then prefers the article title over site chrome", async () => {
  // given: a fixture server that answers every request with the title-priority page
  const fixtureServer = await createFixtureServer((_request, response) => {
    response.writeHead(200, { "content-type": "text/html; charset=utf-8" });
    response.end(titlePriorityFixtureHtml());
  });

  // when: the page is fetched as plain text
  const plainText = textContent(
    await executeWebfetch({ url: `${fixtureServer.baseUrl}/tistory-title-text`, format: "text" }),
  );

  // then: the output opens with the post heading...
  expect(plainText.startsWith("티스토리 본문을 읽어야 합니다")).toBe(true);
  // ...both body sentences are present...
  for (const expected of ["첫 번째 본문 문장은", "두 번째 본문 문장은"]) {
    expect(plainText).toContain(expected);
  }
  // ...and the header and sidebar text is absent
  for (const unwanted of ["블로그 이름", "관련 없는 사이드바 설명"]) {
    expect(plainText).not.toContain(unwanted);
  }
});

it("#given html page #when fetching text #then returns readable text without tags", async () => {
// given
const server = await createFixtureServer((_request, response) => {
Expand Down
Loading