diff --git a/src/clis/xiaohongshu/search.test.ts b/src/clis/xiaohongshu/search.test.ts
index 54556965..c33c990b 100644
--- a/src/clis/xiaohongshu/search.test.ts
+++ b/src/clis/xiaohongshu/search.test.ts
@@ -39,8 +39,8 @@ describe('xiaohongshu search', () => {
     expect(cmd?.func).toBeTypeOf('function');
 
     const page = createPageMock([
-      // First evaluate: early login-wall check (returns true)
-      true,
+      // First evaluate: MutationObserver wait (login wall detected)
+      'login_wall',
     ]);
 
     await expect(cmd!.func!(page, { query: '特斯拉', limit: 5 })).rejects.toThrow(
@@ -61,21 +61,18 @@ describe('xiaohongshu search', () => {
       'https://www.xiaohongshu.com/user/profile/635a9c720000000018028b40?xsec_token=user-token&xsec_source=pc_search';
 
     const page = createPageMock([
-      // First evaluate: early login-wall check (returns false → no wall)
-      false,
-      // Second evaluate: main DOM extraction
-      {
-        loginWall: false,
-        results: [
-          {
-            title: '某鱼买FSD被坑了4万',
-            author: '随风',
-            likes: '261',
-            url: detailUrl,
-            author_url: authorUrl,
-          },
-        ],
-      },
+      // First evaluate: MutationObserver wait (content appeared)
+      'content',
+      // Second evaluate: main DOM extraction (returns array directly)
+      [
+        {
+          title: '某鱼买FSD被坑了4万',
+          author: '随风',
+          likes: '261',
+          url: detailUrl,
+          author_url: authorUrl,
+        },
+      ],
     ]);
 
     const result = await cmd!.func!(page, { query: '特斯拉', limit: 1 });
@@ -101,35 +98,32 @@ describe('xiaohongshu search', () => {
     expect(cmd?.func).toBeTypeOf('function');
 
     const page = createPageMock([
-      // First evaluate: early login-wall check (returns false → no wall)
-      false,
-      // Second evaluate: main DOM extraction
-      {
-        loginWall: false,
-        results: [
-          {
-            title: 'Result A',
-            author: 'UserA',
-            likes: '10',
-            url: 'https://www.xiaohongshu.com/search_result/aaa',
-            author_url: '',
-          },
-          {
-            title: '',
-            author: 'UserB',
-            likes: '5',
-            url: 'https://www.xiaohongshu.com/search_result/bbb',
-            author_url: '',
-          },
-          {
-            title: 'Result C',
-            author: 'UserC',
-            likes: '3',
-            url: 'https://www.xiaohongshu.com/search_result/ccc',
-            author_url: '',
-          },
-        ],
-      },
+      // First evaluate: MutationObserver wait (content appeared)
+      'content',
+      // Second evaluate: main DOM extraction (returns array directly)
+      [
+        {
+          title: 'Result A',
+          author: 'UserA',
+          likes: '10',
+          url: 'https://www.xiaohongshu.com/search_result/aaa',
+          author_url: '',
+        },
+        {
+          title: '',
+          author: 'UserB',
+          likes: '5',
+          url: 'https://www.xiaohongshu.com/search_result/bbb',
+          author_url: '',
+        },
+        {
+          title: 'Result C',
+          author: 'UserC',
+          likes: '3',
+          url: 'https://www.xiaohongshu.com/search_result/ccc',
+          author_url: '',
+        },
+      ],
     ]);
 
     const result = (await cmd!.func!(page, { query: '测试', limit: 1 })) as any[];
@@ -139,34 +133,23 @@ describe('xiaohongshu search', () => {
     expect(result[0]).toMatchObject({ rank: 1, title: 'Result A' });
   });
 
-  it('retries once when the first pass returns empty results', async () => {
+  it('waits for content via MutationObserver before extracting', async () => {
     const cmd = getRegistry().get('xiaohongshu/search');
     expect(cmd?.func).toBeTypeOf('function');
 
     const page = createPageMock([
-      // First pass: login check + empty extraction
-      false,
-      { loginWall: false, results: [] },
-      // Retry pass: login check + non-empty extraction
-      false,
-      {
-        loginWall: false,
-        results: [
-          {
-            title: 'Retry Result',
-            author: 'UserR',
-            likes: '9',
-            url: 'https://www.xiaohongshu.com/search_result/69b739f00000000000000000',
-            author_url: '',
-          },
-        ],
-      },
+      // First evaluate: MutationObserver wait (content appeared)
+      'content',
+      // Second evaluate: extraction (returns empty array)
+      [],
     ]);
 
-    const result = (await cmd!.func!(page, { query: '测试重试', limit: 5 })) as any[];
-    expect(result).toHaveLength(1);
-    expect(result[0]).toMatchObject({ title: 'Retry Result' });
-    expect(page.goto).toHaveBeenCalledTimes(2);
+    const result = (await cmd!.func!(page, { query: '测试等待', limit: 5 })) as any[];
+    expect(result).toHaveLength(0);
+    // Only one navigation, no retry
+    expect(page.goto).toHaveBeenCalledTimes(1);
+    // Two evaluate calls: wait + extraction
+    expect(page.evaluate).toHaveBeenCalledTimes(2);
   });
 });
 
diff --git a/src/clis/xiaohongshu/search.ts b/src/clis/xiaohongshu/search.ts
index a9c12807..dde9f8b8 100644
--- a/src/clis/xiaohongshu/search.ts
+++ b/src/clis/xiaohongshu/search.ts
@@ -9,6 +9,35 @@
 import { cli, Strategy } from '../../registry.js';
 import { AuthRequiredError } from '../../errors.js';
 
+/**
+ * Wait for search results or login wall using MutationObserver (max 5s).
+ * Returns 'content' if note items appeared, 'login_wall' if login gate
+ * detected, or 'timeout' if neither appeared within the deadline.
+ *
+ * Falls back to observing document.documentElement while document.body is
+ * still null, also watches text-node edits (characterData), cancels the
+ * deadline timer on success, and re-checks once before reporting 'timeout'.
+ */
+const WAIT_FOR_CONTENT_JS = `
+  new Promise((resolve) => {
+    const detect = () => {
+      if (document.querySelector('section.note-item')) return 'content';
+      if (/登录后查看搜索结果/.test(document.body?.innerText || '')) return 'login_wall';
+      return null;
+    };
+    const found = detect();
+    if (found) return resolve(found);
+    const observer = new MutationObserver(() => {
+      const result = detect();
+      if (result) { clearTimeout(timer); observer.disconnect(); resolve(result); }
+    });
+    const timer = setTimeout(() => { observer.disconnect(); resolve(detect() || 'timeout'); }, 5000);
+    observer.observe(document.body || document.documentElement, {
+      childList: true, subtree: true, characterData: true,
+    });
+  })
+`;
+
 /**
  * Extract approximate publish date from a Xiaohongshu note URL.
  * XHS note IDs follow MongoDB ObjectID format where the first 8 hex
@@ -39,33 +68,27 @@ cli({
   columns: ['rank', 'title', 'author', 'likes', 'published_at', 'url'],
   func: async (page, kwargs) => {
     const keyword = encodeURIComponent(kwargs.query);
-    const searchUrl =
-      `https://www.xiaohongshu.com/search_result?keyword=${keyword}&source=web_search_result_notes`;
-
-    const fetchAttempt = async () => {
-      await page.goto(searchUrl);
-      await page.wait(3);
-
-      // Early login-wall detection: XHS may show a login gate instead of
-      // results. Check *before* autoScroll to avoid crashing on a page
-      // that has no meaningful content to scroll through.
-      const loginCheck = await page.evaluate(`
-        (() => /登录后查看搜索结果/.test(document.body?.innerText || ''))()
-      `);
-      if (loginCheck) {
-        throw new AuthRequiredError(
-          'www.xiaohongshu.com',
-          'Xiaohongshu search results are blocked behind a login wall',
-        );
-      }
+    await page.goto(
+      `https://www.xiaohongshu.com/search_result?keyword=${keyword}&source=web_search_result_notes`
+    );
+
+    // Wait for search results to render (or login wall to appear).
+    // Uses MutationObserver to resolve as soon as content appears,
+    // instead of a fixed delay + blind retry.
+    const waitResult = await page.evaluate(WAIT_FOR_CONTENT_JS);
+
+    if (waitResult === 'login_wall') {
+      throw new AuthRequiredError(
+        'www.xiaohongshu.com',
+        'Xiaohongshu search results are blocked behind a login wall',
+      );
+    }
 
-      // Scroll a couple of times to load more results
-      await page.autoScroll({ times: 2 });
+    // Scroll a couple of times to load more results
+    await page.autoScroll({ times: 2 });
 
-      const payload = await page.evaluate(`
+    const payload = await page.evaluate(`
         (() => {
-          const loginWall = /登录后查看搜索结果/.test(document.body.innerText || '');
-
           const normalizeUrl = (href) => {
             if (!href) return '';
             if (href.startsWith('http://') || href.startsWith('https://')) return href;
@@ -109,37 +132,18 @@ cli({
             });
           });
 
-          return {
-            loginWall,
-            results,
-          };
+          return results;
         })()
       `);
 
-      if (!payload || typeof payload !== 'object') return [];
-
-      if ((payload as any).loginWall) {
-        throw new AuthRequiredError('www.xiaohongshu.com', 'Xiaohongshu search results are blocked behind a login wall');
-      }
-
-      const data: any[] = Array.isArray((payload as any).results) ? (payload as any).results : [];
-      return data
-        .filter((item: any) => item.title)
-        .slice(0, kwargs.limit)
-        .map((item: any, i: number) => ({
-          rank: i + 1,
-          ...item,
-          published_at: noteIdToDate(item.url),
-        }));
-    };
-
-    let results = await fetchAttempt();
-    if (!results.length) {
-      // XHS search can intermittently render blank blocks in the first paint.
-      // Retry once with a fresh navigation before returning empty.
-      await page.wait(1);
-      results = await fetchAttempt();
-    }
-    return results;
+    const data: any[] = Array.isArray(payload) ? (payload as any[]) : [];
+    return data
+      .filter((item: any) => item.title)
+      .slice(0, kwargs.limit)
+      .map((item: any, i: number) => ({
+        rank: i + 1,
+        ...item,
+        published_at: noteIdToDate(item.url),
+      }));
   },
 });