Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 51 additions & 68 deletions src/clis/xiaohongshu/search.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ describe('xiaohongshu search', () => {
expect(cmd?.func).toBeTypeOf('function');

const page = createPageMock([
// First evaluate: early login-wall check (returns true)
true,
// First evaluate: MutationObserver wait (login wall detected)
'login_wall',
]);

await expect(cmd!.func!(page, { query: '特斯拉', limit: 5 })).rejects.toThrow(
Expand All @@ -61,21 +61,18 @@ describe('xiaohongshu search', () => {
'https://www.xiaohongshu.com/user/profile/635a9c720000000018028b40?xsec_token=user-token&xsec_source=pc_search';

const page = createPageMock([
// First evaluate: early login-wall check (returns false → no wall)
false,
// Second evaluate: main DOM extraction
{
loginWall: false,
results: [
{
title: '某鱼买FSD被坑了4万',
author: '随风',
likes: '261',
url: detailUrl,
author_url: authorUrl,
},
],
},
// First evaluate: MutationObserver wait (content appeared)
'content',
// Second evaluate: main DOM extraction (returns array directly)
[
{
title: '某鱼买FSD被坑了4万',
author: '随风',
likes: '261',
url: detailUrl,
author_url: authorUrl,
},
],
]);

const result = await cmd!.func!(page, { query: '特斯拉', limit: 1 });
Expand All @@ -101,35 +98,32 @@ describe('xiaohongshu search', () => {
expect(cmd?.func).toBeTypeOf('function');

const page = createPageMock([
// First evaluate: early login-wall check (returns false → no wall)
false,
// Second evaluate: main DOM extraction
{
loginWall: false,
results: [
{
title: 'Result A',
author: 'UserA',
likes: '10',
url: 'https://www.xiaohongshu.com/search_result/aaa',
author_url: '',
},
{
title: '',
author: 'UserB',
likes: '5',
url: 'https://www.xiaohongshu.com/search_result/bbb',
author_url: '',
},
{
title: 'Result C',
author: 'UserC',
likes: '3',
url: 'https://www.xiaohongshu.com/search_result/ccc',
author_url: '',
},
],
},
// First evaluate: MutationObserver wait (content appeared)
'content',
// Second evaluate: main DOM extraction (returns array directly)
[
{
title: 'Result A',
author: 'UserA',
likes: '10',
url: 'https://www.xiaohongshu.com/search_result/aaa',
author_url: '',
},
{
title: '',
author: 'UserB',
likes: '5',
url: 'https://www.xiaohongshu.com/search_result/bbb',
author_url: '',
},
{
title: 'Result C',
author: 'UserC',
likes: '3',
url: 'https://www.xiaohongshu.com/search_result/ccc',
author_url: '',
},
],
]);

const result = (await cmd!.func!(page, { query: '测试', limit: 1 })) as any[];
Expand All @@ -139,34 +133,23 @@ describe('xiaohongshu search', () => {
expect(result[0]).toMatchObject({ rank: 1, title: 'Result A' });
});

it('retries once when the first pass returns empty results', async () => {
it('waits for content via MutationObserver before extracting', async () => {
const cmd = getRegistry().get('xiaohongshu/search');
expect(cmd?.func).toBeTypeOf('function');

const page = createPageMock([
// First pass: login check + empty extraction
false,
{ loginWall: false, results: [] },
// Retry pass: login check + non-empty extraction
false,
{
loginWall: false,
results: [
{
title: 'Retry Result',
author: 'UserR',
likes: '9',
url: 'https://www.xiaohongshu.com/search_result/69b739f00000000000000000',
author_url: '',
},
],
},
// First evaluate: MutationObserver wait (content appeared)
'content',
// Second evaluate: extraction (returns empty array)
[],
]);

const result = (await cmd!.func!(page, { query: '测试重试', limit: 5 })) as any[];
expect(result).toHaveLength(1);
expect(result[0]).toMatchObject({ title: 'Retry Result' });
expect(page.goto).toHaveBeenCalledTimes(2);
const result = (await cmd!.func!(page, { query: '测试等待', limit: 5 })) as any[];
expect(result).toHaveLength(0);
// Only one navigation, no retry
expect(page.goto).toHaveBeenCalledTimes(1);
// Two evaluate calls: wait + extraction
expect(page.evaluate).toHaveBeenCalledTimes(2);
});
});

Expand Down
104 changes: 51 additions & 53 deletions src/clis/xiaohongshu/search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,29 @@
import { cli, Strategy } from '../../registry.js';
import { AuthRequiredError } from '../../errors.js';

/**
 * Browser-side script (evaluated in the page) that waits for search results
 * or the login wall, resolving as soon as either one is detected.
 *
 * The script evaluates to a Promise that resolves with:
 * - 'content'    when a `section.note-item` element is present,
 * - 'login_wall' when the page text contains the login-gate message,
 * - 'timeout'    when neither is detected within 5 seconds.
 *
 * Detection runs once immediately and then again on every DOM mutation
 * reported by a MutationObserver (child-list changes anywhere in the tree).
 */
const WAIT_FOR_CONTENT_JS = `
new Promise((resolve) => {
  const detect = () => {
    if (document.querySelector('section.note-item')) return 'content';
    if (/登录后查看搜索结果/.test(document.body?.innerText || '')) return 'login_wall';
    return null;
  };
  const found = detect();
  if (found) return resolve(found);
  let timer;
  const observer = new MutationObserver(() => {
    const result = detect();
    // Cancel the pending deadline so no stale timer is left behind in the page.
    if (result) { observer.disconnect(); clearTimeout(timer); resolve(result); }
  });
  // document.body can still be null while the document is loading; observing
  // a null target throws, so fall back to the root element (or the document),
  // which also reports the body being attached later.
  const root = document.body || document.documentElement || document;
  observer.observe(root, { childList: true, subtree: true });
  timer = setTimeout(() => { observer.disconnect(); resolve('timeout'); }, 5000);
})
`;

/**
* Extract approximate publish date from a Xiaohongshu note URL.
* XHS note IDs follow MongoDB ObjectID format where the first 8 hex
Expand Down Expand Up @@ -39,33 +62,27 @@ cli({
columns: ['rank', 'title', 'author', 'likes', 'published_at', 'url'],
func: async (page, kwargs) => {
const keyword = encodeURIComponent(kwargs.query);
const searchUrl =
`https://www.xiaohongshu.com/search_result?keyword=${keyword}&source=web_search_result_notes`;

const fetchAttempt = async () => {
await page.goto(searchUrl);
await page.wait(3);

// Early login-wall detection: XHS may show a login gate instead of
// results. Check *before* autoScroll to avoid crashing on a page
// that has no meaningful content to scroll through.
const loginCheck = await page.evaluate(`
(() => /登录后查看搜索结果/.test(document.body?.innerText || ''))()
`);
if (loginCheck) {
throw new AuthRequiredError(
'www.xiaohongshu.com',
'Xiaohongshu search results are blocked behind a login wall',
);
}
await page.goto(
`https://www.xiaohongshu.com/search_result?keyword=${keyword}&source=web_search_result_notes`
);

// Wait for search results to render (or login wall to appear).
// Uses MutationObserver to resolve as soon as content appears,
// instead of a fixed delay + blind retry.
const waitResult = await page.evaluate(WAIT_FOR_CONTENT_JS);

if (waitResult === 'login_wall') {
throw new AuthRequiredError(
'www.xiaohongshu.com',
'Xiaohongshu search results are blocked behind a login wall',
);
}

// Scroll a couple of times to load more results
await page.autoScroll({ times: 2 });
// Scroll a couple of times to load more results
await page.autoScroll({ times: 2 });

const payload = await page.evaluate(`
const payload = await page.evaluate(`
(() => {
const loginWall = /登录后查看搜索结果/.test(document.body.innerText || '');

const normalizeUrl = (href) => {
if (!href) return '';
if (href.startsWith('http://') || href.startsWith('https://')) return href;
Expand Down Expand Up @@ -109,37 +126,18 @@ cli({
});
});

return {
loginWall,
results,
};
return results;
})()
`);

if (!payload || typeof payload !== 'object') return [];

if ((payload as any).loginWall) {
throw new AuthRequiredError('www.xiaohongshu.com', 'Xiaohongshu search results are blocked behind a login wall');
}

const data: any[] = Array.isArray((payload as any).results) ? (payload as any).results : [];
return data
.filter((item: any) => item.title)
.slice(0, kwargs.limit)
.map((item: any, i: number) => ({
rank: i + 1,
...item,
published_at: noteIdToDate(item.url),
}));
};

let results = await fetchAttempt();
if (!results.length) {
// XHS search can intermittently render blank blocks in the first paint.
// Retry once with a fresh navigation before returning empty.
await page.wait(1);
results = await fetchAttempt();
}
return results;
const data: any[] = Array.isArray(payload) ? (payload as any[]) : [];
return data
.filter((item: any) => item.title)
.slice(0, kwargs.limit)
.map((item: any, i: number) => ({
rank: i + 1,
...item,
published_at: noteIdToDate(item.url),
}));
},
});