From ddf15e46263ade507d7693b13dc18ef230a3b40c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B3=BD=E5=8A=A0=E6=AD=A6?= Date: Wed, 1 Apr 2026 07:12:39 +0800 Subject: [PATCH 1/3] feat(1688): add browser adapter and docs --- README.md | 1 + README.zh-CN.md | 1 + docs/.vitepress/config.mts | 1 + docs/adapters/browser/1688.md | 50 ++++ docs/adapters/index.md | 1 + docs/developer/testing.md | 2 + docs/guide/troubleshooting.md | 11 + src/clis/1688/item.test.ts | 68 +++++ src/clis/1688/item.ts | 280 ++++++++++++++++++ src/clis/1688/search.test.ts | 52 ++++ src/clis/1688/search.ts | 302 +++++++++++++++++++ src/clis/1688/shared.test.ts | 56 ++++ src/clis/1688/shared.ts | 527 ++++++++++++++++++++++++++++++++++ src/clis/1688/store.test.ts | 63 ++++ src/clis/1688/store.ts | 260 +++++++++++++++++ 15 files changed, 1675 insertions(+) create mode 100644 docs/adapters/browser/1688.md create mode 100644 src/clis/1688/item.test.ts create mode 100644 src/clis/1688/item.ts create mode 100644 src/clis/1688/search.test.ts create mode 100644 src/clis/1688/search.ts create mode 100644 src/clis/1688/shared.test.ts create mode 100644 src/clis/1688/shared.ts create mode 100644 src/clis/1688/store.test.ts create mode 100644 src/clis/1688/store.ts diff --git a/README.md b/README.md index b4492f2a..c6cab43e 100644 --- a/README.md +++ b/README.md @@ -138,6 +138,7 @@ git clone git@github.com:jackwener/opencli.git && cd opencli && npm install && n | **twitter** | `trending` `search` `timeline` `bookmarks` `post` `download` `profile` `article` `like` `likes` `notifications` `reply` `reply-dm` `thread` `follow` `unfollow` `followers` `following` `block` `unblock` `bookmark` `unbookmark` `delete` `hide-reply` `accept` | | **reddit** | `hot` `frontpage` `popular` `search` `subreddit` `user` `user-posts` `user-comments` `read` `save` `saved` `subscribe` `upvote` `upvoted` `comment` | | **amazon** | `bestsellers` `search` `product` `offer` `discussion` | +| **1688** | `search` `item` `store` | | **gemini** 
| `new` `ask` `image` | | **notebooklm** | `status` `list` `open` `select` `current` `get` `metadata` `source-list` `source-get` `source-fulltext` `source-guide` `history` `note-list` `notes-list` `notes-get` `summary` | | **spotify** | `auth` `status` `play` `pause` `next` `prev` `volume` `search` `queue` `shuffle` `repeat` | diff --git a/README.zh-CN.md b/README.zh-CN.md index 6f5d7d3d..e7a46e03 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -191,6 +191,7 @@ npx skills add jackwener/opencli --skill opencli-oneshot # 快速命令参 | **facebook** | `feed` `profile` `search` `friends` `groups` `events` `notifications` `memories` `add-friend` `join-group` | 浏览器 | | **google** | `news` `search` `suggest` `trends` | 公开 | | **amazon** | `bestsellers` `search` `product` `offer` `discussion` | 浏览器 | +| **1688** | `search` `item` `store` | 浏览器 | | **gemini** | `new` `ask` `image` | 浏览器 | | **spotify** | `auth` `status` `play` `pause` `next` `prev` `volume` `search` `queue` `shuffle` `repeat` | OAuth API | | **notebooklm** | `status` `list` `open` `select` `current` `get` `metadata` `source-list` `source-get` `source-fulltext` `source-guide` `history` `note-list` `notes-list` `notes-get` `summary` | 浏览器 | diff --git a/docs/.vitepress/config.mts b/docs/.vitepress/config.mts index 45da8876..7f68f8ea 100644 --- a/docs/.vitepress/config.mts +++ b/docs/.vitepress/config.mts @@ -73,6 +73,7 @@ export default defineConfig({ { text: 'Chaoxing', link: '/adapters/browser/chaoxing' }, { text: 'Grok', link: '/adapters/browser/grok' }, { text: 'Amazon', link: '/adapters/browser/amazon' }, + { text: '1688', link: '/adapters/browser/1688' }, { text: 'Gemini', link: '/adapters/browser/gemini' }, { text: 'NotebookLM', link: '/adapters/browser/notebooklm' }, { text: 'WeRead', link: '/adapters/browser/weread' }, diff --git a/docs/adapters/browser/1688.md b/docs/adapters/browser/1688.md new file mode 100644 index 00000000..53364f56 --- /dev/null +++ b/docs/adapters/browser/1688.md @@ -0,0 +1,50 
@@ +# 1688 + +**Mode**: 🔐 Browser · **Domain**: `1688.com` + +## Commands + +| Command | Description | +|---------|-------------| +| `opencli 1688 search "<query>"` | Search public product candidates with price, MOQ, seller link, and visible badges | +| `opencli 1688 item <url-or-offer-id>` | Read a public product detail page with price tiers, MOQ, delivery text, and seller basics | +| `opencli 1688 store <url-or-member-id>` | Read a public supplier/store page with company info, years on platform, categories, and visible service signals | + +## Usage Examples + +```bash +# Search products +opencli 1688 search "桌面置物架 宿舍 收纳" --limit 10 + +# JSON output +opencli 1688 search "桌面置物架 宿舍 收纳" --limit 10 -f json + +# Read an item by offer id +opencli 1688 item 841141931191 -f json + +# Read an item by URL +opencli 1688 item https://detail.1688.com/offer/841141931191.html -f json + +# Read a supplier store +opencli 1688 store https://shop52908bfw19166.1688.com/ -f json + +# Read a supplier by member id +opencli 1688 store b2b-22154705262941f196 -f json +``` + +## Prerequisites + +- Chrome running and **logged into** `1688.com` +- [Browser Bridge extension](/guide/browser-bridge) installed + +## Notes + +- This adapter only returns fields visible on public pages. It does not send inquiries, place orders, or access seller back office data. +- Prefer stable identifiers such as `offer_id`, `member_id`, and `shop_id` for follow-up workflows. +- `item` can be more sensitive to the active browser target than `search` or `store`. + +## Troubleshooting + +- If `opencli 1688 item` reports `did not expose product context`, first make sure the open page is a real `detail.1688.com` item page. +- If the browser target is too broad, retry with `OPENCLI_CDP_TARGET=detail.1688.com`. +- If you hit a slider or verification page, refresh the real page in Chrome and retry. 
diff --git a/docs/adapters/index.md b/docs/adapters/index.md index 5b5b4cc2..4e7c1611 100644 --- a/docs/adapters/index.md +++ b/docs/adapters/index.md @@ -45,6 +45,7 @@ Run `opencli list` for the live registry. | **[google](./browser/google)** | `news` `search` `suggest` `trends` | 🌐 / 🔐 | | **[jd](./browser/jd)** | `item` | 🔐 Browser | | **[amazon](./browser/amazon)** | `bestsellers` `search` `product` `offer` `discussion` | 🔐 Browser | +| **[1688](./browser/1688)** | `search` `item` `store` | 🔐 Browser | | **[web](./browser/web)** | `read` | 🔐 Browser | | **[weixin](./browser/weixin)** | `download` | 🔐 Browser | | **[36kr](./browser/36kr)** | `news` `hot` `search` `article` | 🌐 / 🔐 | diff --git a/docs/developer/testing.md b/docs/developer/testing.md index 730b6f30..f6712d57 100644 --- a/docs/developer/testing.md +++ b/docs/developer/testing.md @@ -131,6 +131,8 @@ npx vitest src/ - `browser-public.test.ts` 使用 `tryBrowserCommand()`,站点反爬或地域限制导致空数据时会 warn + pass - `browser-auth.test.ts` 验证 **graceful failure**,重点是不 crash、不 hang、错误信息可控 - 如需测试完整登录态,保持 Chrome 登录态并安装 Browser Bridge 扩展,再手动运行对应测试 +- 对依赖具体 host 页面上下文的 browser adapter,除了单测外,还应手动验证真实命令,并把必要的 target host 约束写进 adapter docs / troubleshooting +- 对会主动导航页面的 browser commands,手动验证时优先串行执行;多个 CLI 进程同时连到同一个 CDP target 可能互相覆盖导航,制造假的 adapter 故障 --- diff --git a/docs/guide/troubleshooting.md b/docs/guide/troubleshooting.md index 4b352c9f..9cb7b528 100644 --- a/docs/guide/troubleshooting.md +++ b/docs/guide/troubleshooting.md @@ -12,6 +12,17 @@ - Your login session in Chrome might have expired. Open a normal Chrome tab, navigate to the target site, and log in or refresh the page. - Some sites have geographic restrictions (e.g., Bilibili, Zhihu from outside China). +### Browser command opens the page but still cannot read context + +- A healthy Browser Bridge connection does not guarantee that the current page target exposes the data your adapter expects. 
+- Some browser adapters are sensitive to the active host or page context. +- Example: `opencli 1688 item` may fail with `did not expose product context` if the target is too broad. +- Retry on a real item page, refresh the page in Chrome, and if needed narrow the target, for example: + +```bash +OPENCLI_CDP_TARGET=detail.1688.com opencli 1688 item 841141931191 -f json +``` + ### Node API errors - Make sure you are using **Node.js >= 20**. Some dependencies require modern Node APIs. diff --git a/src/clis/1688/item.test.ts b/src/clis/1688/item.test.ts new file mode 100644 index 00000000..9e9394d2 --- /dev/null +++ b/src/clis/1688/item.test.ts @@ -0,0 +1,68 @@ +import { describe, expect, it } from 'vitest'; +import { __test__ } from './item.js'; + +describe('1688 item normalization', () => { + it('normalizes public item payload into reportable fields', () => { + const result = __test__.normalizeItemPayload({ + href: 'https://detail.1688.com/offer/887904326744.html', + title: '法式春季长袖开衫连衣裙女新款大码女装碎花吊带裙套装142077 - 阿里巴巴', + bodyText: ` + 青岛沁澜衣品服装有限公司 + 入驻13年 + 主营:大码女装 + 店铺回头率 + 87% + 山东青岛 + 3套起批 + 已售1600+套 + 支持定制logo + `, + offerTitle: '法式春季长袖开衫连衣裙女新款大码女装碎花吊带裙套装142077', + offerId: 887904326744, + seller: { + companyName: '青岛沁澜衣品服装有限公司', + memberId: 'b2b-1641351767', + winportUrl: 'https://yinuoweierfushi.1688.com', + }, + trade: { + beginAmount: 3, + priceDisplay: '96.00-98.00', + unit: '套', + saleCount: 1655, + offerIDatacenterSellInfo: { + 面料名称: '莫代尔', + 主面料成分: '莫代尔纤维', + sellPointModel: '{"ignore":true}', + }, + offerPriceModel: { + currentPrices: [ + { beginAmount: 3, price: '98.00' }, + { beginAmount: 50, price: '97.00' }, + ], + }, + }, + gallery: { + mainImage: ['https://example.com/1.jpg'], + offerImgList: ['https://example.com/2.jpg'], + wlImageInfos: [{ fullPathImageURI: 'https://example.com/3.jpg' }], + }, + services: [ + { serviceName: '延期必赔', agreeDeliveryHours: 360 }, + { serviceName: '品质保障' }, + ], + }); + + expect(result.offer_id).toBe('887904326744'); + 
expect(result.member_id).toBe('b2b-1641351767'); + expect(result.shop_id).toBe('yinuoweierfushi'); + expect(result.price_text).toBe('¥96.00-98.00'); + expect(result.moq_text).toBe('3套起批'); + expect(result.origin_place).toBe('山东青岛'); + expect(result.delivery_days_text).toBe('360小时内发货'); + expect(result.private_label_text).toBe('支持定制logo'); + expect(result.visible_attributes).toEqual({ + 面料名称: '莫代尔', + 主面料成分: '莫代尔纤维', + }); + }); +}); diff --git a/src/clis/1688/item.ts b/src/clis/1688/item.ts new file mode 100644 index 00000000..1db98bef --- /dev/null +++ b/src/clis/1688/item.ts @@ -0,0 +1,280 @@ +import { CommandExecutionError } from '../../errors.js'; +import { cli, Strategy } from '../../registry.js'; +import type { IPage } from '../../types.js'; +import { isRecord } from '../../utils.js'; +import { + assertNotCaptcha, + buildCaptchaHint, + buildDetailUrl, + buildProvenance, + cleanMultilineText, + cleanText, + extractLocation, + extractMemberId, + extractOfferId, + extractShopId, + gotoAndReadState, + normalizePriceTiers, + parseMoqText, + parsePriceText, + toNumber, + uniqueNonEmpty, +} from './shared.js'; + +interface BuyerProtectionModel { + serviceName?: string; + shortBuyerDesc?: string; + packageBuyerDesc?: string; + textDesc?: string; + agreeDeliveryHours?: number; +} + +interface ItemBrowserPayload { + href?: string; + title?: string; + bodyText?: string; + offerTitle?: string; + offerId?: string | number; + seller?: { + companyName?: string; + memberId?: string; + winportUrl?: string; + sellerWinportUrlMap?: Record; + }; + trade?: { + beginAmount?: string | number; + priceDisplay?: string; + unit?: string; + saleCount?: string | number; + offerIDatacenterSellInfo?: Record; + offerPriceModel?: { + currentPrices?: Array<{ beginAmount?: string | number; price?: string | number }>; + }; + }; + gallery?: { + mainImage?: string[]; + offerImgList?: string[]; + wlImageInfos?: Array<{ fullPathImageURI?: string }>; + }; + shipping?: { + deliveryLimitText?: string; 
+ logisticsText?: string; + protectionInfos?: BuyerProtectionModel[]; + buyerProtectionModel?: BuyerProtectionModel[]; + }; + services?: BuyerProtectionModel[]; +} + +function normalizeItemPayload(payload: ItemBrowserPayload): Record { + const href = cleanText(payload.href); + const bodyText = cleanMultilineText(payload.bodyText); + const sellerName = cleanText(payload.seller?.companyName); + const sellerUrl = cleanText( + payload.seller?.winportUrl + ?? payload.seller?.sellerWinportUrlMap?.defaultUrl + ?? payload.seller?.sellerWinportUrlMap?.indexUrl, + ); + const offerId = cleanText(String(payload.offerId ?? '')) || extractOfferId(href) || ''; + const memberId = cleanText(payload.seller?.memberId) || extractMemberId(href) || null; + const shopId = extractShopId(sellerUrl) ?? extractShopId(href); + const unit = cleanText(payload.trade?.unit); + const priceDisplay = cleanText(payload.trade?.priceDisplay); + const priceRange = parsePriceText(priceDisplay ? `¥${priceDisplay}` : bodyText); + const moqText = extractMoqText(bodyText, payload.trade?.beginAmount, unit); + const moq = parseMoqText(moqText); + const services = uniqueServices(payload); + const serviceBadges = uniqueNonEmpty(services.map((service) => cleanText(service.serviceName))); + const attributes = normalizeVisibleAttributes(payload.trade?.offerIDatacenterSellInfo); + + const detailUrl = offerId ? buildDetailUrl(offerId) : href; + const provenance = buildProvenance(href || detailUrl); + const priceTiers = normalizePriceTiers(payload.trade?.offerPriceModel?.currentPrices ?? [], unit || null); + const images = uniqueNonEmpty([ + ...(payload.gallery?.mainImage ?? []), + ...(payload.gallery?.offerImgList ?? []), + ...((payload.gallery?.wlImageInfos ?? []).map((item) => item.fullPathImageURI ?? 
'')), + ]); + + return { + offer_id: offerId, + member_id: memberId, + shop_id: shopId, + title: cleanText(payload.offerTitle) || stripAlibabaSuffix(payload.title) || firstNonEmptyLine(bodyText), + item_url: detailUrl, + ...provenance, + main_images: images, + price_text: priceRange.price_text, + price_tiers: priceTiers, + currency: priceRange.currency ?? 'CNY', + moq_text: moq.moq_text, + moq_value: moq.moq_value, + seller_name: sellerName || null, + seller_url: sellerUrl || null, + shop_name: sellerName || null, + origin_place: extractLocation(bodyText), + delivery_days_text: extractDeliveryDaysText(bodyText, services, payload.shipping), + customization_text: extractKeywordLine(bodyText, ['来样定制', '来图定制', '支持定制', '可定制', '定制']), + private_label_text: extractKeywordLine(bodyText, ['贴牌', '贴标', '定制logo', '打logo', 'OEM', 'ODM']), + visible_attributes: attributes, + sales_text: extractSalesText(bodyText), + service_badges: serviceBadges, + stock_quantity: extractStockQuantity(bodyText), + }; +} + +function normalizeVisibleAttributes(raw: unknown): Record { + if (!isRecord(raw)) return {}; + const entries = Object.entries(raw) + .filter(([key, value]) => key !== 'sellPointModel' && cleanText(key) && cleanText(String(value))) + .map(([key, value]) => [cleanText(key), cleanText(String(value))] as const); + return Object.fromEntries(entries); +} + +function uniqueServices(payload: ItemBrowserPayload): BuyerProtectionModel[] { + const combined = [ + ...(Array.isArray(payload.services) ? payload.services : []), + ...(Array.isArray(payload.shipping?.protectionInfos) ? payload.shipping.protectionInfos : []), + ...(Array.isArray(payload.shipping?.buyerProtectionModel) ? 
payload.shipping.buyerProtectionModel : []), + ]; + + const seen = new Set(); + const result: BuyerProtectionModel[] = []; + for (const service of combined) { + const key = cleanText(service.serviceName); + if (!key || seen.has(key)) continue; + seen.add(key); + result.push(service); + } + return result; +} + +function stripAlibabaSuffix(title: string | undefined): string { + return cleanText(title).replace(/\s*-\s*阿里巴巴$/, '').trim(); +} + +function firstNonEmptyLine(text: string): string { + return text.split('\n').map((line) => cleanText(line)).find(Boolean) ?? ''; +} + +function extractMoqText(bodyText: string, beginAmount: string | number | undefined, unit: string): string { + const lineMatch = bodyText.match(/\d+(?:\.\d+)?\s*(件|个|套|箱|包|双|台|把|只)\s*起批/); + if (lineMatch) return lineMatch[0]; + + const moqValue = toNumber(beginAmount); + if (moqValue !== null) { + return `${moqValue}${unit || ''}起批`; + } + + return ''; +} + +function extractDeliveryDaysText( + bodyText: string, + services: BuyerProtectionModel[], + shipping: ItemBrowserPayload['shipping'], +): string | null { + const shippingText = cleanText(shipping?.deliveryLimitText) || cleanText(shipping?.logisticsText); + if (shippingText) return shippingText; + + const textMatch = bodyText.match(/\d+\s*(?:小时|天)(?:内)?发货/); + if (textMatch) return textMatch[0]; + + const hourMatch = services.find((service) => typeof service.agreeDeliveryHours === 'number'); + if (hourMatch && typeof hourMatch.agreeDeliveryHours === 'number') { + return `${hourMatch.agreeDeliveryHours}小时内发货`; + } + + return null; +} + +function extractKeywordLine(bodyText: string, keywords: string[]): string | null { + const lines = bodyText.split('\n').map((line) => cleanText(line)).filter(Boolean); + for (const line of lines) { + if (keywords.some((keyword) => line.includes(keyword))) { + return line; + } + } + return null; +} + +function extractSalesText(bodyText: string): string | null { + const match = 
bodyText.match(/(?:全网销量|已售)\s*\d+(?:\.\d+)?\+?[件套个]?/); + return match ? cleanText(match[0]) : null; +} + +function extractStockQuantity(bodyText: string): number | null { + const match = bodyText.match(/库存\s*(\d+)/); + return match ? Number.parseInt(match[1], 10) : null; +} + +async function readItemPayload(page: IPage, itemUrl: string): Promise { + let state = await gotoAndReadState(page, itemUrl, 2500, 'item'); + if (state.href && !state.href.includes('/offer/')) { + assertNotCaptcha(state, 'item'); + } + + const payload = await page.evaluate(` + (() => { + const root = window.context ?? {}; + const model = root.result?.global?.globalData?.model ?? null; + const toJson = (value) => JSON.parse(JSON.stringify(value ?? null)); + return { + href: window.location.href, + title: document.title || '', + bodyText: document.body ? document.body.innerText || '' : '', + offerTitle: model?.offerTitleModel?.subject ?? '', + offerId: model?.tradeModel?.offerId ?? '', + seller: toJson(model?.sellerModel), + trade: toJson(model?.tradeModel), + gallery: toJson(root.result?.data?.gallery?.fields ?? null), + shipping: toJson(root.result?.data?.shippingServices?.fields ?? null), + services: toJson(root.result?.data?.shippingServices?.fields?.protectionInfos ?? []), + }; + })() + `) as ItemBrowserPayload; + + if (!cleanText(String(payload.offerId ?? 
''))) { + state = await gotoAndReadState(page, itemUrl, 2500, 'item'); + assertNotCaptcha(state, 'item'); + throw new CommandExecutionError( + '1688 item page did not expose product context', + `${buildCaptchaHint('item')} If the page is still open but blank, refresh the item page in Chrome and retry.`, + ); + } + + return payload; +} + +cli({ + site: '1688', + name: 'item', + description: '1688 商品详情(公开商品字段、价格阶梯、卖家基础信息)', + domain: 'www.1688.com', + strategy: Strategy.COOKIE, + navigateBefore: false, + args: [ + { + name: 'input', + required: true, + positional: true, + help: '1688 商品 URL 或 offer ID(如 887904326744)', + }, + ], + columns: ['offer_id', 'title', 'price_text', 'moq_text', 'seller_name', 'origin_place'], + func: async (page, kwargs) => { + const itemUrl = buildDetailUrl(String(kwargs.input ?? '')); + const payload = await readItemPayload(page, itemUrl); + return [normalizeItemPayload(payload)]; + }, +}); + +export const __test__ = { + normalizeItemPayload, + normalizeVisibleAttributes, + stripAlibabaSuffix, + extractMoqText, + extractDeliveryDaysText, + extractKeywordLine, + extractSalesText, + extractStockQuantity, +}; diff --git a/src/clis/1688/search.test.ts b/src/clis/1688/search.test.ts new file mode 100644 index 00000000..e204480c --- /dev/null +++ b/src/clis/1688/search.test.ts @@ -0,0 +1,52 @@ +import { describe, expect, it } from 'vitest'; +import { __test__ } from './search.js'; + +describe('1688 search normalization', () => { + it('normalizes search candidates into structured result rows', () => { + const result = __test__.normalizeSearchCandidate({ + item_url: 'https://detail.1688.com/offer/887904326744.html', + title: '宿舍置物架桌面加高架', + container_text: '宿舍置物架桌面加高架 ¥56.00 2套起批 山东青岛 已售300+套', + price_text: '¥ 56 .00', + sales_text: '300+套', + moq_text: '2套起批', + tag_items: ['退货包运费', '回头率52%'], + hover_items: ['验厂报告'], + seller_name: '青岛沁澜衣品服装有限公司', + seller_url: 'https://yinuoweierfushi.1688.com', + }, 1, 
'https://s.1688.com/selloffer/offer_search.htm?charset=utf8&keywords=置物架'); + + expect(result.rank).toBe(1); + expect(result.offer_id).toBe('887904326744'); + expect(result.shop_id).toBe('yinuoweierfushi'); + expect(result.price_text).toBe('¥56.00'); + expect(result.price_min).toBe(56); + expect(result.price_max).toBe(56); + expect(result.moq_value).toBe(2); + expect(result.location).toBe('山东青岛'); + expect(result.sales_text).toBe('300+套'); + expect(result.badges).toEqual(expect.arrayContaining(['退货包运费', '验厂报告'])); + expect(result.return_rate_text).toBe('回头率52%'); + }); + + it('extracts offer id from mobile detail search links', () => { + const result = __test__.normalizeSearchCandidate({ + item_url: 'http://detail.m.1688.com/page/index.html?offerId=910933345396&sortType=&pageId=', + title: '', + container_text: '桌面书桌办公室工位收纳展示新中式博古架多层茶具厨房摆放置物架 ¥24.3 已售20+件', + price_text: '¥ 14 .28', + sales_text: '1500+件', + moq_text: '≥2个', + seller_name: '泰商国际贸易(宁阳)有限公司', + seller_url: 'http://tsgjmy.1688.com/', + }, 1, 'https://s.1688.com/selloffer/offer_search.htm?charset=utf8&keywords=桌面置物架'); + + expect(result.offer_id).toBe('910933345396'); + expect(result.shop_id).toBe('tsgjmy'); + expect(result.title).toContain('桌面书桌办公室工位收纳展示'); + expect(result.price_text).toBe('¥14.28'); + expect(result.sales_text).toBe('1500+件'); + expect(result.moq_text).toBe('≥2个'); + expect(result.moq_value).toBe(2); + }); +}); diff --git a/src/clis/1688/search.ts b/src/clis/1688/search.ts new file mode 100644 index 00000000..87976a84 --- /dev/null +++ b/src/clis/1688/search.ts @@ -0,0 +1,302 @@ +import { CommandExecutionError } from '../../errors.js'; +import { cli, Strategy } from '../../registry.js'; +import type { IPage } from '../../types.js'; +import { + FACTORY_BADGE_PATTERNS, + SERVICE_BADGE_PATTERNS, + assertNotCaptcha, + buildProvenance, + buildSearchUrl, + cleanText, + extractBadges, + extractLocation, + extractMemberId, + extractOfferId, + extractShopId, + gotoAndReadState, + 
limitCandidates, + parseMoqText, + parsePriceText, + uniqueNonEmpty, +} from './shared.js'; + +interface SearchPayload { + href?: string; + title?: string; + bodyText?: string; + candidates?: Array<{ + item_url?: string; + title?: string; + container_text?: string; + desc_rows?: string[]; + price_text?: string | null; + sales_text?: string | null; + hover_price_text?: string | null; + moq_text?: string | null; + tag_items?: string[]; + hover_items?: string[]; + seller_name?: string | null; + seller_url?: string | null; + }>; +} + +const SEARCH_ITEM_URL_PATTERNS = [ + 'detail.1688.com/offer/', + 'detail.m.1688.com/page/index.html?offerId=', +]; + +function normalizeSearchCandidate( + candidate: NonNullable[number], + rank: number, + sourceUrl: string, +): Record { + const itemUrl = cleanText(candidate.item_url); + const containerText = cleanText(candidate.container_text); + const priceText = firstNonEmpty([ + normalizeInlineText(candidate.price_text), + normalizeInlineText(extractPriceText(candidate.hover_price_text)), + ]); + const priceRange = parsePriceText(priceText || containerText); + const moq = parseMoqText(firstNonEmpty([ + normalizeInlineText(candidate.moq_text), + normalizeInlineText(extractMoqText(candidate.hover_price_text)), + normalizeInlineText(extractMoqText(containerText)), + ])); + const sellerUrl = cleanText(candidate.seller_url); + const evidenceText = uniqueNonEmpty([ + containerText, + ...(candidate.desc_rows ?? []), + ...(candidate.tag_items ?? []), + ...(candidate.hover_items ?? []), + ]).join('\n'); + const badges = extractBadges(evidenceText, [...FACTORY_BADGE_PATTERNS, ...SERVICE_BADGE_PATTERNS]); + const salesText = firstNonEmpty([ + extractSalesText(candidate.sales_text), + extractSalesText(containerText) ?? 
'', + ]) || null; + + return { + rank, + offer_id: extractOfferId(itemUrl), + member_id: extractMemberId(sellerUrl), + shop_id: extractShopId(sellerUrl), + title: cleanText(candidate.title) || firstLine(containerText), + source_url: sourceUrl, + fetched_at: new Date().toISOString(), + strategy: 'cookie', + price_text: priceRange.price_text, + price_min: priceRange.price_min, + price_max: priceRange.price_max, + currency: priceRange.currency ?? 'CNY', + moq_text: moq.moq_text, + moq_value: moq.moq_value, + seller_name: cleanText(candidate.seller_name) || null, + seller_url: sellerUrl || null, + item_url: itemUrl, + location: extractLocation(containerText), + badges, + sales_text: salesText, + return_rate_text: extractReturnRateText(candidate.tag_items ?? []), + }; +} + +function extractMoqText(text: string | null | undefined): string { + const normalized = normalizeInlineText(text); + return normalized.match(/\d+(?:\.\d+)?\s*(件|个|套|箱|包|双|台|把|只)\s*起批/i)?.[0] + ?? normalized.match(/≥\s*\d+(?:\.\d+)?\s*(件|个|套|箱|包|双|台|把|只)?/i)?.[0] + ?? normalized.match(/\d+(?:\.\d+)?\s*(?:~|-|至|到)\s*\d+(?:\.\d+)?\s*(件|个|套|箱|包|双|台|把|只)/i)?.[0] + ?? ''; +} + +function extractPriceText(text: string | null | undefined): string { + const normalized = normalizeInlineText(text); + return normalized.match(/[¥$€]\s*\d+(?:\.\d+)?/)?.[0] ?? ''; +} + +function extractSalesText(text: string | null | undefined): string | null { + const normalized = normalizeInlineText(text); + if (!normalized) return null; + if (/^\d+(?:\.\d+)?\+?\s*(件|套|个|单)$/.test(normalized)) { + return normalized; + } + const match = normalized.match(/(?:已售|销量|售)\s*\d+(?:\.\d+)?\+?\s*(件|套|个|单)?/); + return match ? cleanText(match[0]) : null; +} + +function firstLine(text: string): string { + return text.split(/\s+/).find(Boolean) ?? ''; +} + +function firstNonEmpty(values: Array): string { + return values.map((value) => cleanText(value)).find(Boolean) ?? 
''; +} + +function normalizeInlineText(text: string | null | undefined): string { + return cleanText(text) + .replace(/([¥$€])\s+(?=\d)/g, '$1') + .replace(/(\d)\s*\.\s*(\d)/g, '$1.$2') + .replace(/\s*([~-])\s*/g, '$1') + .trim(); +} + +function extractReturnRateText(values: string[]): string | null { + return uniqueNonEmpty(values.map((value) => normalizeInlineText(value))) + .find((value) => /^回头率\s*\d+(?:\.\d+)?%$/.test(value)) + ?? null; +} + +async function readSearchPayload(page: IPage, query: string): Promise { + const url = buildSearchUrl(query); + const state = await gotoAndReadState(page, url, 2500, 'search'); + assertNotCaptcha(state, 'search'); + + return await page.evaluate(` + (() => { + const normalizeText = (value) => (value || '').replace(/\\s+/g, ' ').trim(); + const isItemHref = (href) => ${JSON.stringify(SEARCH_ITEM_URL_PATTERNS)}.some((pattern) => (href || '').includes(pattern)); + const uniqueTexts = (values) => [...new Set(values.map((value) => normalizeText(value)).filter(Boolean))]; + const collectTexts = (root, selector) => uniqueTexts( + Array.from(root.querySelectorAll(selector)).map((node) => node.innerText || node.textContent || ''), + ); + const firstText = (root, selectors) => { + for (const selector of selectors) { + const node = root.querySelector(selector); + const value = normalizeText(node ? 
node.innerText || node.textContent || '' : ''); + if (value) return value; + } + return ''; + }; + const findMoqText = (values, priceText) => { + const moqPattern = /(≥\\s*\\d+(?:\\.\\d+)?\\s*(件|个|套|箱|包|双|台|把|只)?)|(\\d+(?:\\.\\d+)?\\s*(?:~|-|至|到)\\s*\\d+(?:\\.\\d+)?\\s*(件|个|套|箱|包|双|台|把|只))|(\\d+(?:\\.\\d+)?\\s*(件|个|套|箱|包|双|台|把|只)\\s*起批)/i; + return values.find((value) => moqPattern.test(value)) + || normalizeText(priceText).match(moqPattern)?.[0] + || ''; + }; + const isSellerHref = (href) => { + if (!href) return false; + try { + const url = new URL(href, window.location.href); + const host = url.hostname || ''; + if (!host.endsWith('.1688.com')) return false; + if (host === 's.1688.com' || host === 'r.1688.com' || host === 'air.1688.com' || host === 'detail.1688.com' || host === 'detail.m.1688.com' || host === 'dj.1688.com') { + return false; + } + return true; + } catch { + return false; + } + }; + const collectCandidates = () => { + const anchors = Array.from(document.querySelectorAll('a')).filter((anchor) => isItemHref(anchor.href || '')); + const seen = new Set(); + const items = []; + + const pickContainer = (anchor) => { + let node = anchor; + while (node && node !== document.body) { + const text = normalizeText(node.innerText || node.textContent || ''); + if (text.length >= 40 && text.length <= 2000) { + return node; + } + node = node.parentElement; + } + return anchor; + }; + + for (const anchor of anchors) { + const href = anchor.href || ''; + if (!href || seen.has(href)) continue; + seen.add(href); + + const container = pickContainer(anchor); + const tagItems = collectTexts(container, '.offer-tag-row .offer-desc-item'); + const hoverItems = collectTexts(container, '.offer-hover-wrapper .offer-desc-item'); + const sellerAnchor = Array.from(container.querySelectorAll('a')) + .find((link) => isSellerHref(link.href || '')); + const hoverPriceText = firstText(container, [ + '.offer-hover-wrapper .hover-price-item', + '.offer-hover-wrapper .price-item', + ]); 
+ + items.push({ + item_url: href, + title: firstText(container, ['.offer-title-row .title-text', '.offer-title-row']) + || normalizeText(anchor.innerText || anchor.textContent || ''), + container_text: normalizeText(container.innerText || container.textContent || ''), + desc_rows: collectTexts(container, '.offer-desc-row'), + price_text: firstText(container, ['.offer-price-row .price-item']), + sales_text: firstText(container, ['.offer-price-row .col-desc_after', '.offer-desc-row .col-desc_after']), + hover_price_text: hoverPriceText, + moq_text: findMoqText(hoverItems, hoverPriceText), + tag_items: tagItems, + hover_items: hoverItems, + seller_name: sellerAnchor ? normalizeText(sellerAnchor.innerText || sellerAnchor.textContent || '') : null, + seller_url: sellerAnchor ? sellerAnchor.href : null, + }); + } + + return items; + }; + + return { + href: window.location.href, + title: document.title || '', + bodyText: document.body ? document.body.innerText || '' : '', + candidates: collectCandidates(), + }; + })() + `) as SearchPayload; +} + +cli({ + site: '1688', + name: 'search', + description: '1688 商品搜索(结果候选、卖家链接、价格/MOQ/销量文本)', + domain: 'www.1688.com', + strategy: Strategy.COOKIE, + navigateBefore: false, + args: [ + { + name: 'query', + required: true, + positional: true, + help: '搜索关键词,如 "置物架"', + }, + { + name: 'limit', + type: 'int', + default: 20, + help: '结果数量上限(默认 20)', + }, + ], + columns: ['rank', 'title', 'price_text', 'moq_text', 'seller_name', 'location'], + func: async (page, kwargs) => { + const query = String(kwargs.query ?? ''); + const limit = Math.max(1, Number(kwargs.limit) || 20); + const payload = await readSearchPayload(page, query); + const sourceUrl = cleanText(payload.href) || buildSearchUrl(query); + const candidates = limitCandidates(payload.candidates ?? 
[], limit) + .filter((candidate) => cleanText(candidate.item_url)); + + if (candidates.length === 0) { + throw new CommandExecutionError( + '1688 search did not expose any result cards', + 'The search page likely hit a slider challenge or changed its DOM. Open the same query in Chrome, solve any challenge, keep a clean 1688 tab selected, and retry.', + ); + } + + const provenance = buildProvenance(sourceUrl); + return candidates.map((candidate, index) => ({ + ...normalizeSearchCandidate(candidate, index + 1, sourceUrl), + fetched_at: provenance.fetched_at, + strategy: provenance.strategy, + })); + }, +}); + +export const __test__ = { + normalizeSearchCandidate, + extractMoqText, + extractSalesText, + firstLine, +}; diff --git a/src/clis/1688/shared.test.ts b/src/clis/1688/shared.test.ts new file mode 100644 index 00000000..30a70b78 --- /dev/null +++ b/src/clis/1688/shared.test.ts @@ -0,0 +1,56 @@ +import { describe, expect, it } from 'vitest'; +import { __test__ } from './shared.js'; + +describe('1688 shared helpers', () => { + it('builds encoded search URLs', () => { + expect(__test__.buildSearchUrl('置物架')).toBe( + 'https://s.1688.com/selloffer/offer_search.htm?charset=utf8&keywords=%E7%BD%AE%E7%89%A9%E6%9E%B6', + ); + }); + + it('extracts stable ids from 1688 inputs', () => { + expect(__test__.extractOfferId('887904326744')).toBe('887904326744'); + expect(__test__.extractOfferId('https://detail.1688.com/offer/887904326744.html')).toBe('887904326744'); + expect(__test__.extractMemberId('https://winport.m.1688.com/page/index.html?memberId=b2b-1641351767')).toBe('b2b-1641351767'); + expect(__test__.extractMemberId('b2b-22154705262941f196')).toBe('b2b-22154705262941f196'); + expect(__test__.resolveStoreUrl('b2b-22154705262941f196')).toBe( + 'https://winport.m.1688.com/page/index.html?memberId=b2b-22154705262941f196', + ); + expect(__test__.extractShopId('https://yinuoweierfushi.1688.com/page/index.html')).toBe('yinuoweierfushi'); + }); + + it('parses price ranges and 
moq text', () => { + expect(__test__.parsePriceText('¥96.00-98.00')).toEqual({ + price_text: '¥96.00-98.00', + price_min: 96, + price_max: 98, + currency: 'CNY', + }); + + expect(__test__.parsePriceText('¥ 14 .28')).toEqual({ + price_text: '¥14.28', + price_min: 14.28, + price_max: 14.28, + currency: 'CNY', + }); + + expect(__test__.parseMoqText('3套起批')).toEqual({ + moq_text: '3套起批', + moq_value: 3, + }); + + expect(__test__.parseMoqText('2~999个')).toEqual({ + moq_text: '2~999个', + moq_value: 2, + }); + }); + + it('extracts location and captcha states', () => { + expect(__test__.extractLocation('山东青岛 送至 江苏苏州')).toBe('山东青岛'); + expect(__test__.isCaptchaState({ + href: 'https://s.1688.com/_____tmd_____/punish', + title: '验证码拦截', + body_text: '请拖动下方滑块完成验证', + })).toBe(true); + }); +}); diff --git a/src/clis/1688/shared.ts b/src/clis/1688/shared.ts new file mode 100644 index 00000000..383199df --- /dev/null +++ b/src/clis/1688/shared.ts @@ -0,0 +1,527 @@ +import { ArgumentError, AuthRequiredError, CommandExecutionError } from '../../errors.js'; +import type { IPage } from '../../types.js'; + +export const SITE = '1688'; +export const HOME_URL = 'https://www.1688.com/'; +export const SEARCH_URL_PREFIX = 'https://s.1688.com/selloffer/offer_search.htm?charset=utf8&keywords='; +export const DETAIL_URL_PREFIX = 'https://detail.1688.com/offer/'; +export const STORE_MOBILE_URL_PREFIX = 'https://winport.m.1688.com/page/index.html?memberId='; +export const STRATEGY = 'cookie'; + +const CAPTCHA_URL_MARKER = '/_____tmd_____/punish'; +const CAPTCHA_TEXT_PATTERNS = [ + '请拖动下方滑块完成验证', + '请按住滑块,拖动到最右边', + '通过验证以确保正常访问', + '验证码拦截', + '访问验证', + '滑动验证', +]; +export const FACTORY_BADGE_PATTERNS = [ + '源头工厂', + '深度验厂', + '实力工厂', + '工厂档案', + '加工专区', + '验厂报告', + '厂家直销', + '生产厂家', + '工厂直供', +]; +export const SERVICE_BADGE_PATTERNS = [ + '延期必赔', + '品质保障', + '破损包赔', + '退货包运费', + '晚发必赔', + '7*24小时响应', + '48小时发货', + '72小时发货', + '后天达', + '包邮', + '闪电拿样', +]; +const CHINA_LOCATIONS = [ + '北京', + 
'天津', + '上海', + '重庆', + '河北', + '山西', + '辽宁', + '吉林', + '黑龙江', + '江苏', + '浙江', + '安徽', + '福建', + '江西', + '山东', + '河南', + '湖北', + '湖南', + '广东', + '海南', + '四川', + '贵州', + '云南', + '陕西', + '甘肃', + '青海', + '台湾', + '内蒙古', + '广西', + '西藏', + '宁夏', + '新疆', + '香港', + '澳门', +]; + +export interface ProvenanceFields { + source_url: string; + fetched_at: string; + strategy: string; +} + +export interface PageState { + href: string; + title: string; + body_text: string; +} + +export interface PriceRange { + price_text: string; + price_min: number | null; + price_max: number | null; + currency: string | null; +} + +export interface MoqValue { + moq_text: string; + moq_value: number | null; +} + +export interface PriceTier { + quantity_text: string; + quantity_min: number | null; + price_text: string; + price: number | null; + currency: string | null; +} + +export interface SearchCandidate { + item_url: string; + title: string; + container_text: string; + seller_name: string | null; + seller_url: string | null; +} + +export function cleanText(value: unknown): string { + return typeof value === 'string' + ? value.replace(/\u00a0/g, ' ').replace(/\s+/g, ' ').trim() + : ''; +} + +export function cleanMultilineText(value: unknown): string { + return typeof value === 'string' + ? 
value + .replace(/\u00a0/g, ' ') + .split('\n') + .map((line) => line.replace(/\s+/g, ' ').trim()) + .filter(Boolean) + .join('\n') + : ''; +} + +export function uniqueNonEmpty(values: Array): string[] { + return [...new Set(values.map((value) => cleanText(value)).filter(Boolean))]; +} + +export function buildSearchUrl(query: string): string { + const normalized = cleanText(query); + if (!normalized) { + throw new ArgumentError('1688 search query cannot be empty'); + } + return `${SEARCH_URL_PREFIX}${encodeURIComponent(normalized)}`; +} + +export function buildDetailUrl(input: string): string { + const offerId = extractOfferId(input); + if (!offerId) { + throw new ArgumentError( + '1688 item expects an offer URL or offer ID', + 'Example: opencli 1688 item 887904326744', + ); + } + return `${DETAIL_URL_PREFIX}${offerId}.html`; +} + +export function resolveStoreUrl(input: string): string { + const normalized = cleanText(input); + if (!normalized) { + throw new ArgumentError('1688 store expects a store URL, shop host, or member ID'); + } + + if (/^https?:\/\//i.test(normalized)) { + return canonicalizeStoreUrl(normalized); + } + + const memberId = extractMemberId(normalized); + if (memberId) { + return `${STORE_MOBILE_URL_PREFIX}${memberId}`; + } + + if (normalized.endsWith('.1688.com')) { + return canonicalizeStoreUrl(`https://${normalized}`); + } + + if (/^[a-z0-9-]+$/i.test(normalized)) { + return canonicalizeStoreUrl(`https://${normalized}.1688.com`); + } + + throw new ArgumentError( + '1688 store expects a store URL, shop host, or member ID', + 'Example: opencli 1688 store https://yinuoweierfushi.1688.com/?offerId=887904326744', + ); +} + +export function canonicalizeStoreUrl(input: string): string { + try { + const url = new URL(input); + if (!url.hostname.endsWith('1688.com')) { + throw new Error('not-1688'); + } + return url.toString(); + } catch { + throw new ArgumentError('Invalid 1688 store URL'); + } +} + +export function extractOfferId(input: string): 
string | null { + const normalized = cleanText(input); + if (!normalized) return null; + const directId = normalized.match(/^\d{6,}$/)?.[0]; + if (directId) return directId; + const detailMatch = normalized.match(/\/offer\/(\d{6,})\.html/i); + if (detailMatch) return detailMatch[1]; + const queryMatch = normalized.match(/[?&]offerId=(\d{6,})/i); + if (queryMatch) return queryMatch[1]; + return null; +} + +export function extractMemberId(input: string): string | null { + const normalized = cleanText(input); + if (!normalized) return null; + const direct = normalized.match(/\bb2b-[a-z0-9]+\b/i)?.[0]; + if (direct) return direct; + const queryMatch = normalized.match(/[?&]memberId=(b2b-[a-z0-9]+)/i); + if (queryMatch) return queryMatch[1]; + const mobileMatch = normalized.match(/\/winport\/(b2b-[a-z0-9]+)\.html/i); + if (mobileMatch) return mobileMatch[1]; + return null; +} + +export function extractShopId(input: string): string | null { + const normalized = cleanText(input); + if (!normalized) return null; + try { + const url = new URL(/^https?:\/\//i.test(normalized) ? normalized : `https://${normalized}`); + const [subdomain] = url.hostname.split('.'); + if (!subdomain || ['www', 'detail', 's', 'winport', 'work'].includes(subdomain)) { + return null; + } + return subdomain; + } catch { + return /^[a-z0-9-]+$/i.test(normalized) ? normalized : null; + } +} + +export function buildProvenance(sourceUrl: string): ProvenanceFields { + return { + source_url: sourceUrl, + fetched_at: new Date().toISOString(), + strategy: STRATEGY, + }; +} + +export function parsePriceText(text: string): PriceRange { + const normalized = normalizeNumericText(cleanText(text)); + const matches = normalized.match(/\d+(?:,\d{3})*(?:\.\d+)?/g) ?? 
[]; + const values = matches + .map((value) => Number.parseFloat(value.replace(/,/g, ''))) + .filter((value) => Number.isFinite(value)); + + if (values.length === 0) { + return { + price_text: normalized, + price_min: null, + price_max: null, + currency: null, + }; + } + + return { + price_text: normalized, + price_min: values[0] ?? null, + price_max: values[values.length - 1] ?? values[0] ?? null, + currency: normalized.includes('¥') || normalized.includes('元') ? 'CNY' : null, + }; +} + +export function normalizePriceTiers( + rawTiers: Array<{ beginAmount?: unknown; price?: unknown }>, + unit: string | null, +): PriceTier[] { + return rawTiers + .map((tier) => { + const quantityMin = toNumber(tier.beginAmount); + const priceText = cleanText(tier.price); + const price = toNumber(tier.price); + return { + quantity_text: quantityMin !== null + ? `${quantityMin}${unit ?? ''}` + : '', + quantity_min: quantityMin, + price_text: priceText, + price, + currency: priceText ? 'CNY' : null, + }; + }) + .filter((tier) => tier.price_text); +} + +export function parseMoqText(text: string): MoqValue { + const normalized = normalizeNumericText(cleanText(text)); + const match = normalized.match(/(\d+(?:\.\d+)?)\s*(件|个|套|箱|包|双|台|把|只|pcs|piece|pieces)?\s*起批/i) + ?? normalized.match(/≥\s*(\d+(?:\.\d+)?)/); + const rangeMatch = normalized.match( + /(\d+(?:\.\d+)?)\s*(?:~|-|至|到)\s*\d+(?:\.\d+)?\s*(件|个|套|箱|包|双|台|把|只|pcs|piece|pieces)/i, + ); + + if (!match && !rangeMatch) { + return { + moq_text: normalized, + moq_value: null, + }; + } + + return { + moq_text: normalized, + moq_value: Number.parseFloat((match ?? rangeMatch)![1]), + }; +} + +export function extractLocation(text: string): string | null { + const normalized = cleanMultilineText(text); + const primaryRegion = normalized.split(/送至|发往/)[0] ?? 
normalized; + const lines = primaryRegion.split('\n'); + for (const line of lines) { + const compact = cleanText(line); + if (!compact || compact.length > 16) continue; + if (CHINA_LOCATIONS.some((location) => compact.startsWith(location))) { + return compact; + } + } + + const locationPattern = new RegExp(`(${CHINA_LOCATIONS.join('|')})[\\u4e00-\\u9fa5]{0,8}`); + return primaryRegion.match(locationPattern)?.[0] ?? null; +} + +export function extractAddress(text: string): string | null { + const normalized = cleanMultilineText(text); + const lineMatch = normalized.match(/地址[::]\s*([^\n]+)/); + if (lineMatch) return cleanText(lineMatch[1]); + return normalized + .split('\n') + .map((line) => cleanText(line)) + .find((line) => line.includes('省') || line.includes('市') || line.includes('区') || line.includes('县')) + ?? null; +} + +export function extractMetric(text: string, label: string): string | null { + const normalized = cleanMultilineText(text); + const direct = normalized.match(new RegExp(`${escapeForRegex(label)}[::]?\\s*([^\\n]+)`)); + if (direct) return cleanText(direct[1]); + + const lineBased = normalized.match(new RegExp(`${escapeForRegex(label)}\\n([^\\n]+)`)); + return lineBased ? cleanText(lineBased[1]) : null; +} + +export function extractYearsOnPlatform(text: string): string | null { + return text.match(/入驻\d+年/)?.[0] ?? null; +} + +export function extractMainBusiness(text: string): string | null { + const value = extractMetric(text, '主营'); + return value ? 
value.replace(/^:/, '').trim() : null; +} + +export function extractBadges(text: string, candidates: string[]): string[] { + return uniqueNonEmpty( + candidates.filter((candidate) => cleanMultilineText(text).includes(candidate)), + ); +} + +export function guessTopCategories(text: string): string[] { + const mainBusiness = extractMainBusiness(text); + if (!mainBusiness) return []; + return uniqueNonEmpty(mainBusiness.split(/[、,/|]/).map((value) => value.trim())); +} + +export function isCaptchaState(state: Partial): boolean { + const href = cleanText(state.href).toLowerCase(); + const title = cleanText(state.title); + const bodyText = cleanMultilineText(state.body_text); + if (href.includes(CAPTCHA_URL_MARKER)) return true; + return CAPTCHA_TEXT_PATTERNS.some((pattern) => title.includes(pattern) || bodyText.includes(pattern)); +} + +export function buildCaptchaHint(action: string): string { + return [ + `Open a clean 1688 ${action} page in the shared Chrome profile and finish any slider challenge first.`, + 'If you run opencli via CDP, set OPENCLI_CDP_TARGET=1688.com or a more specific 1688 host before retrying.', + ].join(' '); +} + +export async function readPageState(page: IPage): Promise { + const result = await page.evaluate(` + (() => ({ + href: window.location.href, + title: document.title || '', + body_text: document.body ? document.body.innerText || '' : '', + }))() + `) as Partial; + + return { + href: cleanText(result.href), + title: cleanText(result.title), + body_text: cleanMultilineText(result.body_text), + }; +} + +export async function gotoAndReadState( + page: IPage, + url: string, + settleMs: number = 2500, + action: string = 'page', +): Promise { + try { + await page.goto(url, { settleMs }); + await page.wait(1.5); + return readPageState(page); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + if ( + message.includes('Inspected target navigated or closed') + || message.includes('Cannot find context with specified id') + || message.includes('Target closed') + ) { + throw new CommandExecutionError( + `1688 ${action} navigation lost the current browser target`, + `${buildCaptchaHint(action)} If CDP is attached to a stale or blocked tab, open a fresh 1688 tab and point OPENCLI_CDP_TARGET at that tab.`, + ); + } + throw error; + } +} + +export async function ensure1688Session(page: IPage): Promise { + const state = await gotoAndReadState(page, HOME_URL, 1500); + if (isCaptchaState(state)) { + throw new CommandExecutionError( + '1688 homepage is currently blocked by a slider challenge', + buildCaptchaHint('homepage'), + ); + } + + const authState = await page.evaluate(` + (() => { + const text = document.body ? document.body.innerText || '' : ''; + const hasSearchInput = !!document.querySelector('input#alisearch-input, input[name="keywords"]'); + const hasLoggedMarker = ['采购车', '收藏的品', '我的足迹', '全部订单'] + .some((label) => text.includes(label)); + const hasLoginPrompt = ['请登录', '立即登录', '登录后'] + .some((label) => text.includes(label)); + return { + hasSearchInput, + hasLoggedMarker, + hasLoginPrompt, + }; + })() + `) as { hasSearchInput?: boolean; hasLoggedMarker?: boolean; hasLoginPrompt?: boolean }; + + const isLoggedIn = authState.hasSearchInput === true + && authState.hasLoggedMarker === true + && authState.hasLoginPrompt !== true; + + if (!isLoggedIn) { + throw new AuthRequiredError( + '1688.com', + '1688 is not logged in in the shared Chrome profile', + ); + } +} + +export function assertNotCaptcha(state: PageState, action: string): void { + if (!isCaptchaState(state)) return; + throw new CommandExecutionError( + `1688 ${action} hit a slider challenge`, + buildCaptchaHint(action), + ); +} + +export function toNumber(value: unknown): number | null { + if (typeof value === 'number' && Number.isFinite(value)) { + return value; + 
} + if (typeof value === 'string') { + const normalized = value.replace(/,/g, '').trim(); + if (!normalized) return null; + const parsed = Number.parseFloat(normalized); + return Number.isFinite(parsed) ? parsed : null; + } + return null; +} + +export function limitCandidates(values: T[], limit: number): T[] { + const normalizedLimit = Math.max(1, Math.trunc(limit) || 1); + return values.slice(0, normalizedLimit); +} + +function normalizeNumericText(value: string): string { + return value + .replace(/([¥$€])\s+(?=\d)/g, '$1') + .replace(/(\d)\s*\.\s*(\d)/g, '$1.$2') + .replace(/\s*([~-])\s*/g, '$1') + .trim(); +} + +function escapeForRegex(value: string): string { + return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + +export const __test__ = { + buildSearchUrl, + buildDetailUrl, + resolveStoreUrl, + extractOfferId, + extractMemberId, + extractShopId, + parsePriceText, + normalizePriceTiers, + parseMoqText, + extractLocation, + extractAddress, + extractMetric, + extractYearsOnPlatform, + extractMainBusiness, + extractBadges, + guessTopCategories, + isCaptchaState, + cleanText, + cleanMultilineText, + uniqueNonEmpty, + limitCandidates, +}; diff --git a/src/clis/1688/store.test.ts b/src/clis/1688/store.test.ts new file mode 100644 index 00000000..bf4bcc6c --- /dev/null +++ b/src/clis/1688/store.test.ts @@ -0,0 +1,63 @@ +import { describe, expect, it } from 'vitest'; +import { __test__ } from './store.js'; + +describe('1688 store normalization', () => { + it('merges store contact text with seller seed data', () => { + const result = __test__.normalizeStorePayload({ + resolvedUrl: 'https://yinuoweierfushi.1688.com/?offerId=887904326744', + explicitMemberId: null, + storePayload: { + href: 'https://yinuoweierfushi.1688.com/page/index.html', + bodyText: ` + 青岛沁澜衣品服装有限公司 + 联系方式 + 地址:山东省青岛市即墨区环秀街道办事处湘江二路97号甲 + `, + offerLinks: ['https://detail.1688.com/offer/887904326744.html'], + }, + contactPayload: { + href: 'https://yinuoweierfushi.1688.com/page/contactinfo.html', 
+ bodyText: ` + 青岛沁澜衣品服装有限公司 + 电话:86 0532 86655366 + 手机:15963238678 + 地址:山东省青岛市即墨区环秀街道办事处湘江二路97号甲 + `, + }, + seed: { + bodyText: ` + 入驻13年 + 主营:大码女装 + 店铺回头率 + 87% + 延期必赔 + 品质保障 + `, + seller: { + companyName: '青岛沁澜衣品服装有限公司', + memberId: 'b2b-1641351767', + winportUrl: 'https://yinuoweierfushi.1688.com', + }, + services: [{ serviceName: '延期必赔' }, { serviceName: '品质保障' }], + }, + }); + + expect(result.member_id).toBe('b2b-1641351767'); + expect(result.store_url).toBe('https://yinuoweierfushi.1688.com'); + expect(result.company_url).toBe('https://yinuoweierfushi.1688.com/page/contactinfo.html'); + expect(result.years_on_platform_text).toBe('入驻13年'); + expect(result.location).toBe('山东省青岛市即墨区环秀街道办事处湘江二路97号甲'); + expect(result.return_rate_text).toBe('87%'); + expect(result.top_categories).toEqual(['大码女装']); + expect(result.service_badges).toEqual(['延期必赔', '品质保障']); + }); + + it('builds contact urls and extracts offer ids', () => { + expect(__test__.buildContactUrl('https://yinuoweierfushi.1688.com')).toBe( + 'https://yinuoweierfushi.1688.com/page/contactinfo.html', + ); + expect(__test__.firstOfferId([ + 'https://detail.1688.com/offer/887904326744.html', + ])).toBe('887904326744'); + }); +}); diff --git a/src/clis/1688/store.ts b/src/clis/1688/store.ts new file mode 100644 index 00000000..5f7365ed --- /dev/null +++ b/src/clis/1688/store.ts @@ -0,0 +1,260 @@ +import { CommandExecutionError } from '../../errors.js'; +import { cli, Strategy } from '../../registry.js'; +import type { IPage } from '../../types.js'; +import { + FACTORY_BADGE_PATTERNS, + SERVICE_BADGE_PATTERNS, + assertNotCaptcha, + buildCaptchaHint, + buildDetailUrl, + buildProvenance, + cleanMultilineText, + cleanText, + extractAddress, + extractBadges, + extractMainBusiness, + extractMemberId, + extractMetric, + extractOfferId, + extractShopId, + extractYearsOnPlatform, + gotoAndReadState, + guessTopCategories, + resolveStoreUrl, + uniqueNonEmpty, +} from './shared.js'; + +interface StoreBrowserPayload { + 
href?: string; + title?: string; + bodyText?: string; + offerLinks?: string[]; + contactLinks?: string[]; +} + +interface StoreItemSeed { + href?: string; + bodyText?: string; + seller?: { + companyName?: string; + memberId?: string; + winportUrl?: string; + sellerWinportUrlMap?: Record; + }; + services?: Array<{ serviceName?: string }>; +} + +function normalizeStorePayload(input: { + resolvedUrl: string; + storePayload: StoreBrowserPayload | null; + contactPayload: StoreBrowserPayload | null; + seed: StoreItemSeed | null; + explicitMemberId: string | null; +}): Record { + const storePayload = input.storePayload; + const contactPayload = input.contactPayload; + const seed = input.seed; + + const contactText = cleanMultilineText(contactPayload?.bodyText); + const storeText = cleanMultilineText(storePayload?.bodyText); + const seedText = cleanMultilineText(seed?.bodyText); + const combinedText = [contactText, storeText, seedText].filter(Boolean).join('\n'); + + const sellerUrl = cleanText( + seed?.seller?.winportUrl + ?? seed?.seller?.sellerWinportUrlMap?.defaultUrl + ?? storePayload?.href + ?? input.resolvedUrl, + ); + const memberId = cleanText(seed?.seller?.memberId) + || input.explicitMemberId + || extractMemberId(input.resolvedUrl) + || null; + const shopId = extractShopId(sellerUrl) ?? extractShopId(input.resolvedUrl); + const companyName = cleanText(seed?.seller?.companyName) + || firstNamedLine(contactText) + || firstNamedLine(storeText) + || null; + const storeUrl = canonicalStoreUrl(sellerUrl || input.resolvedUrl); + const companyUrl = buildContactUrl(storeUrl) ?? storeUrl; + const serviceBadges = uniqueNonEmpty([ + ...extractBadges(combinedText, SERVICE_BADGE_PATTERNS), + ...((seed?.services ?? 
[]).map((service) => cleanText(service.serviceName))), + ]); + const factoryBadges = extractBadges(combinedText, FACTORY_BADGE_PATTERNS); + + return { + member_id: memberId, + shop_id: shopId, + store_name: companyName, + store_url: storeUrl, + company_name: companyName, + company_url: companyUrl, + ...buildProvenance(contactPayload?.href || storePayload?.href || input.resolvedUrl), + business_model_text: firstMetric(combinedText, ['经营模式', '生产加工', '主营产品']), + years_on_platform_text: extractYearsOnPlatform(combinedText), + location: extractAddress(contactText) ?? extractAddress(storeText), + staff_size_text: firstMetric(combinedText, ['员工人数', '员工总数']), + factory_badges: factoryBadges, + service_badges: serviceBadges, + response_rate_text: firstMetric(combinedText, ['响应率', '回复率', '响应速度']), + return_rate_text: extractReturnRate(combinedText), + top_categories: guessTopCategories(combinedText), + phone_text: extractMetric(contactText, '电话'), + mobile_text: extractMetric(contactText, '手机'), + }; +} + +function canonicalStoreUrl(url: string): string { + try { + const parsed = new URL(url); + return `${parsed.protocol}//${parsed.hostname}`; + } catch { + return url; + } +} + +function buildContactUrl(storeUrl: string): string | null { + try { + const parsed = new URL(storeUrl); + return `${parsed.protocol}//${parsed.hostname}/page/contactinfo.html`; + } catch { + return null; + } +} + +function firstNamedLine(text: string): string | null { + return text + .split('\n') + .map((line) => cleanText(line)) + .find((line) => line.includes('有限公司') || line.includes('商行') || line.includes('工厂')) + ?? 
null; +} + +function firstMetric(text: string, labels: string[]): string | null { + for (const label of labels) { + const value = extractMetric(text, label); + if (value) return value; + } + return null; +} + +function extractReturnRate(text: string): string | null { + const inline = text.match(/回头率\s*([0-9.]+%)/); + if (inline) return inline[1]; + const multiline = text.match(/回头率\n([0-9.]+%)/); + return multiline ? multiline[1] : null; +} + +function firstOfferId(links: string[]): string | null { + for (const link of links) { + const offerId = extractOfferId(link); + if (offerId) return offerId; + } + return null; +} + +async function readStorePayload( + page: IPage, + url: string, + action: string, +): Promise { + const state = await gotoAndReadState(page, url, 2500, action); + assertNotCaptcha(state, action); + + return await page.evaluate(` + (() => ({ + href: window.location.href, + title: document.title || '', + bodyText: document.body ? document.body.innerText || '' : '', + offerLinks: Array.from(document.querySelectorAll('a[href*="detail.1688.com/offer/"]')) + .map((anchor) => anchor.href) + .filter(Boolean), + contactLinks: Array.from(document.querySelectorAll('a[href*="contactinfo"]')) + .map((anchor) => anchor.href) + .filter(Boolean), + }))() + `) as StoreBrowserPayload; +} + +async function readItemSeed( + page: IPage, + offerId: string, +): Promise { + const itemUrl = buildDetailUrl(offerId); + const state = await gotoAndReadState(page, itemUrl, 2500, 'store seed item'); + assertNotCaptcha(state, 'store seed item'); + + const seed = await page.evaluate(` + (() => { + const model = window.context?.result?.global?.globalData?.model ?? null; + const toJson = (value) => JSON.parse(JSON.stringify(value ?? null)); + return { + href: window.location.href, + bodyText: document.body ? document.body.innerText || '' : '', + seller: toJson(model?.sellerModel), + services: toJson(model?.shippingServices?.fields?.buyerProtectionModel ?? 
[]), + }; + })() + `) as StoreItemSeed; + + if (!cleanText(seed.href) || !seed.seller) { + throw new CommandExecutionError( + '1688 store seed item did not expose seller context', + `${buildCaptchaHint('item')} Open a real 1688 item page in Chrome and retry.`, + ); + } + + return seed; +} + +cli({ + site: '1688', + name: 'store', + description: '1688 店铺/供应商公开信息(联系方式、主营、入驻年限、公开服务信号)', + domain: 'www.1688.com', + strategy: Strategy.COOKIE, + navigateBefore: false, + args: [ + { + name: 'input', + required: true, + positional: true, + help: '1688 店铺 URL、店铺 host 或 member ID(如 b2b-22154705262941f196)', + }, + ], + columns: ['company_name', 'years_on_platform_text', 'location', 'return_rate_text'], + func: async (page, kwargs) => { + const rawInput = String(kwargs.input ?? ''); + const resolvedUrl = resolveStoreUrl(rawInput); + const explicitMemberId = extractMemberId(rawInput); + + const storePayload = await readStorePayload(page, resolvedUrl, 'store'); + const contactUrl = buildContactUrl(storePayload.href || resolvedUrl); + const contactPayload = contactUrl ? await readStorePayload(page, contactUrl, 'store contact') : null; + const offerId = extractOfferId(rawInput) + || firstOfferId(storePayload.offerLinks ?? []) + || firstOfferId(contactPayload?.offerLinks ?? []); + const seed = offerId ? 
await readItemSeed(page, offerId) : null; + + return [ + normalizeStorePayload({ + resolvedUrl, + storePayload, + contactPayload, + seed, + explicitMemberId, + }), + ]; + }, +}); + +export const __test__ = { + normalizeStorePayload, + canonicalStoreUrl, + buildContactUrl, + firstNamedLine, + firstMetric, + extractReturnRate, + firstOfferId, +}; From 5ae05257d0bf0af077d82756f998d8c3c74e5ec7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B3=BD=E5=8A=A0=E6=AD=A6?= Date: Wed, 1 Apr 2026 16:50:49 +0800 Subject: [PATCH 2/3] fix(1688): retry alternate store seed offers --- src/clis/1688/shared.test.ts | 3 +++ src/clis/1688/shared.ts | 4 ++-- src/clis/1688/store.test.ts | 22 ++++++++++++++++++++ src/clis/1688/store.ts | 39 ++++++++++++++++++++++++++++++++---- 4 files changed, 62 insertions(+), 6 deletions(-) diff --git a/src/clis/1688/shared.test.ts b/src/clis/1688/shared.test.ts index 30a70b78..3395fc1d 100644 --- a/src/clis/1688/shared.test.ts +++ b/src/clis/1688/shared.test.ts @@ -47,6 +47,9 @@ describe('1688 shared helpers', () => { it('extracts location and captcha states', () => { expect(__test__.extractLocation('山东青岛 送至 江苏苏州')).toBe('山东青岛'); + expect(__test__.extractMetric(`主营:家装建材 +地址:江苏省常州市武进区横林镇崔桥崔卫路40号`, '主营')).toBe('家装建材'); + expect(__test__.extractMetric('常州市优品诺家居科技有限公司是家居用品、家居用品等产品专业生产加工的公司', '生产加工')).toBe(null); expect(__test__.isCaptchaState({ href: 'https://s.1688.com/_____tmd_____/punish', title: '验证码拦截', diff --git a/src/clis/1688/shared.ts b/src/clis/1688/shared.ts index 383199df..b4a904bc 100644 --- a/src/clis/1688/shared.ts +++ b/src/clis/1688/shared.ts @@ -342,10 +342,10 @@ export function extractAddress(text: string): string | null { export function extractMetric(text: string, label: string): string | null { const normalized = cleanMultilineText(text); - const direct = normalized.match(new RegExp(`${escapeForRegex(label)}[::]?\\s*([^\\n]+)`)); + const direct = normalized.match(new RegExp(`(?:^|\\n)\\s*${escapeForRegex(label)}[::]?\\s*([^\\n]+)`)); if 
(direct) return cleanText(direct[1]); - const lineBased = normalized.match(new RegExp(`${escapeForRegex(label)}\\n([^\\n]+)`)); + const lineBased = normalized.match(new RegExp(`(?:^|\\n)\\s*${escapeForRegex(label)}\\n([^\\n]+)`)); return lineBased ? cleanText(lineBased[1]) : null; } diff --git a/src/clis/1688/store.test.ts b/src/clis/1688/store.test.ts index bf4bcc6c..a005a65e 100644 --- a/src/clis/1688/store.test.ts +++ b/src/clis/1688/store.test.ts @@ -60,4 +60,26 @@ describe('1688 store normalization', () => { 'https://detail.1688.com/offer/887904326744.html', ])).toBe('887904326744'); }); + + it('collects deduplicated offer ids from input and store links', () => { + expect(__test__.collectOfferIds( + 'https://detail.1688.com/offer/887904326744.html', + { + href: 'https://yinuoweierfushi.1688.com/page/index.html', + bodyText: '', + offerLinks: [ + 'https://detail.1688.com/offer/887904326744.html', + 'https://detail.1688.com/offer/123456789012.html', + ], + }, + { + href: 'https://yinuoweierfushi.1688.com/page/contactinfo.html', + bodyText: '', + offerLinks: [ + 'https://detail.1688.com/offer/123456789012.html', + 'https://detail.1688.com/offer/999999999999.html', + ], + }, + )).toEqual(['887904326744', '123456789012', '999999999999']); + }); }); diff --git a/src/clis/1688/store.ts b/src/clis/1688/store.ts index 5f7365ed..da7024e4 100644 --- a/src/clis/1688/store.ts +++ b/src/clis/1688/store.ts @@ -44,6 +44,22 @@ interface StoreItemSeed { services?: Array<{ serviceName?: string }>; } +function collectOfferIds( + rawInput: string, + storePayload: StoreBrowserPayload | null, + contactPayload: StoreBrowserPayload | null, +): string[] { + const ids = uniqueNonEmpty([ + rawInput, + ...(storePayload?.offerLinks ?? []), + ...(contactPayload?.offerLinks ?? 
[]), + ]) + .map((value) => extractOfferId(value)) + .filter((value): value is string => Boolean(value)); + + return [...new Set(ids)]; +} + function normalizeStorePayload(input: { resolvedUrl: string; storePayload: StoreBrowserPayload | null; @@ -208,6 +224,20 @@ async function readItemSeed( return seed; } +async function readFirstUsableItemSeed( + page: IPage, + offerIds: string[], +): Promise { + for (const offerId of offerIds.slice(0, 8)) { + try { + return await readItemSeed(page, offerId); + } catch (err) { + if (!(err instanceof CommandExecutionError)) throw err; + } + } + return null; +} + cli({ site: '1688', name: 'store', @@ -232,10 +262,10 @@ cli({ const storePayload = await readStorePayload(page, resolvedUrl, 'store'); const contactUrl = buildContactUrl(storePayload.href || resolvedUrl); const contactPayload = contactUrl ? await readStorePayload(page, contactUrl, 'store contact') : null; - const offerId = extractOfferId(rawInput) - || firstOfferId(storePayload.offerLinks ?? []) - || firstOfferId(contactPayload?.offerLinks ?? []); - const seed = offerId ? 
await readItemSeed(page, offerId) : null; + const seed = await readFirstUsableItemSeed( + page, + collectOfferIds(rawInput, storePayload, contactPayload), + ); return [ normalizeStorePayload({ @@ -257,4 +287,5 @@ export const __test__ = { firstMetric, extractReturnRate, firstOfferId, + collectOfferIds, }; From f853ba6508f60bf1a2c4ba14f547b48831e4708f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B3=BD=E5=8A=A0=E6=AD=A6?= Date: Fri, 3 Apr 2026 11:52:12 +0800 Subject: [PATCH 3/3] feat(1688): harden adapter contracts and search pagination --- docs/adapters/browser/1688.md | 4 +- src/clis/1688/item.test.ts | 13 +- src/clis/1688/item.ts | 58 +++++---- src/clis/1688/search.test.ts | 23 +++- src/clis/1688/search.ts | 239 ++++++++++++++++++++++++---------- src/clis/1688/shared.test.ts | 28 +++- src/clis/1688/shared.ts | 228 ++++++++++++++++++++++---------- src/clis/1688/store.test.ts | 32 ++--- src/clis/1688/store.ts | 143 ++++++++++---------- 9 files changed, 497 insertions(+), 271 deletions(-) diff --git a/docs/adapters/browser/1688.md b/docs/adapters/browser/1688.md index 53364f56..ba82ec9e 100644 --- a/docs/adapters/browser/1688.md +++ b/docs/adapters/browser/1688.md @@ -6,7 +6,7 @@ | Command | Description | |---------|-------------| -| `opencli 1688 search ""` | Search public product candidates with price, MOQ, seller link, and visible badges | +| `opencli 1688 search "" --limit ` | Search public product candidates with price, MOQ, seller link, and visible badges | | `opencli 1688 item ` | Read a public product detail page with price tiers, MOQ, delivery text, and seller basics | | `opencli 1688 store ` | Read a public supplier/store page with company info, years on platform, categories, and visible service signals | @@ -41,6 +41,8 @@ opencli 1688 store b2b-22154705262941f196 -f json - This adapter only returns fields visible on public pages. It does not send inquiries, place orders, or access seller back office data. 
- Prefer stable identifiers such as `offer_id`, `member_id`, and `shop_id` for follow-up workflows. +- `search --limit` defaults to `20` and is capped at `100`. +- `search` deduplicates with key priority: `offer_id` first, then canonical `item_url`. - `item` can be more sensitive to the active browser target than `search` or `store`. ## Troubleshooting diff --git a/src/clis/1688/item.test.ts b/src/clis/1688/item.test.ts index 9e9394d2..209847a0 100644 --- a/src/clis/1688/item.test.ts +++ b/src/clis/1688/item.test.ts @@ -2,7 +2,7 @@ import { describe, expect, it } from 'vitest'; import { __test__ } from './item.js'; describe('1688 item normalization', () => { - it('normalizes public item payload into reportable fields', () => { + it('normalizes public item payload into contract fields', () => { const result = __test__.normalizeItemPayload({ href: 'https://detail.1688.com/offer/887904326744.html', title: '法式春季长袖开衫连衣裙女新款大码女装碎花吊带裙套装142077 - 阿里巴巴', @@ -22,7 +22,7 @@ describe('1688 item normalization', () => { seller: { companyName: '青岛沁澜衣品服装有限公司', memberId: 'b2b-1641351767', - winportUrl: 'https://yinuoweierfushi.1688.com', + winportUrl: 'https://yinuoweierfushi.1688.com/page/index.html?spm=a1', }, trade: { beginAmount: 3, @@ -55,14 +55,15 @@ describe('1688 item normalization', () => { expect(result.offer_id).toBe('887904326744'); expect(result.member_id).toBe('b2b-1641351767'); expect(result.shop_id).toBe('yinuoweierfushi'); + expect(result.seller_url).toBe('https://yinuoweierfushi.1688.com'); expect(result.price_text).toBe('¥96.00-98.00'); expect(result.moq_text).toBe('3套起批'); expect(result.origin_place).toBe('山东青岛'); expect(result.delivery_days_text).toBe('360小时内发货'); expect(result.private_label_text).toBe('支持定制logo'); - expect(result.visible_attributes).toEqual({ - 面料名称: '莫代尔', - 主面料成分: '莫代尔纤维', - }); + expect(result.visible_attributes).toEqual([ + { key: '面料名称', value: '莫代尔' }, + { key: '主面料成分', value: '莫代尔纤维' }, + ]); }); }); diff --git a/src/clis/1688/item.ts 
b/src/clis/1688/item.ts index 1db98bef..79c03c3e 100644 --- a/src/clis/1688/item.ts +++ b/src/clis/1688/item.ts @@ -3,10 +3,10 @@ import { cli, Strategy } from '../../registry.js'; import type { IPage } from '../../types.js'; import { isRecord } from '../../utils.js'; import { - assertNotCaptcha, - buildCaptchaHint, + assertAuthenticatedState, buildDetailUrl, buildProvenance, + canonicalizeSellerUrl, cleanMultilineText, cleanText, extractLocation, @@ -65,18 +65,24 @@ interface ItemBrowserPayload { services?: BuyerProtectionModel[]; } +interface VisibleAttribute { + key: string; + value: string; +} + function normalizeItemPayload(payload: ItemBrowserPayload): Record { const href = cleanText(payload.href); const bodyText = cleanMultilineText(payload.bodyText); const sellerName = cleanText(payload.seller?.companyName); - const sellerUrl = cleanText( + const sellerUrlRaw = cleanText( payload.seller?.winportUrl ?? payload.seller?.sellerWinportUrlMap?.defaultUrl ?? payload.seller?.sellerWinportUrlMap?.indexUrl, ); - const offerId = cleanText(String(payload.offerId ?? '')) || extractOfferId(href) || ''; - const memberId = cleanText(payload.seller?.memberId) || extractMemberId(href) || null; - const shopId = extractShopId(sellerUrl) ?? extractShopId(href); + const sellerUrl = canonicalizeSellerUrl(sellerUrlRaw); + const offerId = cleanText(String(payload.offerId ?? '')) || extractOfferId(href) || null; + const memberId = cleanText(payload.seller?.memberId) || extractMemberId(sellerUrlRaw || href) || null; + const shopId = extractShopId(sellerUrl ?? href); const unit = cleanText(payload.trade?.unit); const priceDisplay = cleanText(payload.trade?.priceDisplay); const priceRange = parsePriceText(priceDisplay ? `¥${priceDisplay}` : bodyText); @@ -85,9 +91,6 @@ function normalizeItemPayload(payload: ItemBrowserPayload): Record cleanText(service.serviceName))); const attributes = normalizeVisibleAttributes(payload.trade?.offerIDatacenterSellInfo); - - const detailUrl = offerId ? 
buildDetailUrl(offerId) : href; - const provenance = buildProvenance(href || detailUrl); const priceTiers = normalizePriceTiers(payload.trade?.offerPriceModel?.currentPrices ?? [], unit || null); const images = uniqueNonEmpty([ ...(payload.gallery?.mainImage ?? []), @@ -95,21 +98,23 @@ function normalizeItemPayload(payload: ItemBrowserPayload): Record item.fullPathImageURI ?? '')), ]); + const detailUrl = offerId ? buildDetailUrl(offerId) : href; + const provenance = buildProvenance(href || detailUrl); + return { offer_id: offerId, member_id: memberId, shop_id: shopId, - title: cleanText(payload.offerTitle) || stripAlibabaSuffix(payload.title) || firstNonEmptyLine(bodyText), + title: cleanText(payload.offerTitle) || stripAlibabaSuffix(payload.title) || firstNonEmptyLine(bodyText) || null, item_url: detailUrl, - ...provenance, main_images: images, - price_text: priceRange.price_text, + price_text: priceRange.price_text || null, price_tiers: priceTiers, - currency: priceRange.currency ?? 'CNY', - moq_text: moq.moq_text, + currency: priceRange.currency, + moq_text: moq.moq_text || null, moq_value: moq.moq_value, seller_name: sellerName || null, - seller_url: sellerUrl || null, + seller_url: sellerUrl, shop_name: sellerName || null, origin_place: extractLocation(bodyText), delivery_days_text: extractDeliveryDaysText(bodyText, services, payload.shipping), @@ -119,15 +124,15 @@ function normalizeItemPayload(payload: ItemBrowserPayload): Record { - if (!isRecord(raw)) return {}; - const entries = Object.entries(raw) +function normalizeVisibleAttributes(raw: unknown): VisibleAttribute[] { + if (!isRecord(raw)) return []; + return Object.entries(raw) .filter(([key, value]) => key !== 'sellPointModel' && cleanText(key) && cleanText(String(value))) - .map(([key, value]) => [cleanText(key), cleanText(String(value))] as const); - return Object.fromEntries(entries); + .map(([key, value]) => ({ key: cleanText(key), value: cleanText(String(value)) })); } function 
uniqueServices(payload: ItemBrowserPayload): BuyerProtectionModel[] { @@ -208,10 +213,8 @@ function extractStockQuantity(bodyText: string): number | null { } async function readItemPayload(page: IPage, itemUrl: string): Promise { - let state = await gotoAndReadState(page, itemUrl, 2500, 'item'); - if (state.href && !state.href.includes('/offer/')) { - assertNotCaptcha(state, 'item'); - } + const state = await gotoAndReadState(page, itemUrl, 2500, 'item'); + assertAuthenticatedState(state, 'item'); const payload = await page.evaluate(` (() => { @@ -233,12 +236,11 @@ async function readItemPayload(page: IPage, itemUrl: string): Promise { tag_items: ['退货包运费', '回头率52%'], hover_items: ['验厂报告'], seller_name: '青岛沁澜衣品服装有限公司', - seller_url: 'https://yinuoweierfushi.1688.com', - }, 1, 'https://s.1688.com/selloffer/offer_search.htm?charset=utf8&keywords=置物架'); + seller_url: 'https://yinuoweierfushi.1688.com/page/index.html?spm=a123', + }, 'https://s.1688.com/selloffer/offer_search.htm?charset=utf8&keywords=置物架'); - expect(result.rank).toBe(1); + expect(result.rank).toBe(0); expect(result.offer_id).toBe('887904326744'); expect(result.shop_id).toBe('yinuoweierfushi'); + expect(result.item_url).toBe('https://detail.1688.com/offer/887904326744.html'); + expect(result.seller_url).toBe('https://yinuoweierfushi.1688.com'); expect(result.price_text).toBe('¥56.00'); expect(result.price_min).toBe(56); expect(result.price_max).toBe(56); @@ -39,14 +41,27 @@ describe('1688 search normalization', () => { moq_text: '≥2个', seller_name: '泰商国际贸易(宁阳)有限公司', seller_url: 'http://tsgjmy.1688.com/', - }, 1, 'https://s.1688.com/selloffer/offer_search.htm?charset=utf8&keywords=桌面置物架'); + }, 'https://s.1688.com/selloffer/offer_search.htm?charset=utf8&keywords=桌面置物架'); expect(result.offer_id).toBe('910933345396'); expect(result.shop_id).toBe('tsgjmy'); + expect(result.item_url).toBe('https://detail.1688.com/offer/910933345396.html'); expect(result.title).toContain('桌面书桌办公室工位收纳展示'); 
expect(result.price_text).toBe('¥14.28'); expect(result.sales_text).toBe('1500+件'); expect(result.moq_text).toBe('≥2个'); expect(result.moq_value).toBe(2); }); + + it('prefers offer id and falls back to item url for dedupe key', () => { + expect(__test__.buildDedupeKey({ + offer_id: '123456', + item_url: 'https://detail.1688.com/offer/123456.html', + })).toBe('offer:123456'); + expect(__test__.buildDedupeKey({ + offer_id: null, + item_url: 'https://detail.1688.com/offer/123456.html', + })).toBe('url:https://detail.1688.com/offer/123456.html'); + expect(__test__.buildDedupeKey({ offer_id: null, item_url: null })).toBeNull(); + }); }); diff --git a/src/clis/1688/search.ts b/src/clis/1688/search.ts index 87976a84..f3727ab5 100644 --- a/src/clis/1688/search.ts +++ b/src/clis/1688/search.ts @@ -1,12 +1,14 @@ -import { CommandExecutionError } from '../../errors.js'; +import { CommandExecutionError, EmptyResultError } from '../../errors.js'; import { cli, Strategy } from '../../registry.js'; import type { IPage } from '../../types.js'; import { FACTORY_BADGE_PATTERNS, SERVICE_BADGE_PATTERNS, - assertNotCaptcha, + assertAuthenticatedState, buildProvenance, buildSearchUrl, + canonicalizeItemUrl, + canonicalizeSellerUrl, cleanText, extractBadges, extractLocation, @@ -14,9 +16,11 @@ import { extractOfferId, extractShopId, gotoAndReadState, - limitCandidates, parseMoqText, parsePriceText, + SEARCH_LIMIT_DEFAULT, + SEARCH_LIMIT_MAX, + parseSearchLimit, uniqueNonEmpty, } from './shared.js'; @@ -24,6 +28,7 @@ interface SearchPayload { href?: string; title?: string; bodyText?: string; + next_url?: string; candidates?: Array<{ item_url?: string; title?: string; @@ -40,17 +45,41 @@ interface SearchPayload { }>; } +interface SearchRow { + rank: number; + offer_id: string | null; + member_id: string | null; + shop_id: string | null; + title: string | null; + item_url: string | null; + seller_name: string | null; + seller_url: string | null; + price_text: string | null; + price_min: 
number | null; + price_max: number | null; + currency: string | null; + moq_text: string | null; + moq_value: number | null; + location: string | null; + badges: string[]; + sales_text: string | null; + return_rate_text: string | null; + source_url: string; + fetched_at: string; + strategy: string; +} + const SEARCH_ITEM_URL_PATTERNS = [ 'detail.1688.com/offer/', 'detail.m.1688.com/page/index.html?offerId=', ]; +const MAX_SEARCH_PAGES = 12; function normalizeSearchCandidate( candidate: NonNullable[number], - rank: number, sourceUrl: string, -): Record { - const itemUrl = cleanText(candidate.item_url); +): SearchRow { + const canonicalItemUrl = canonicalizeItemUrl(cleanText(candidate.item_url)); const containerText = cleanText(candidate.container_text); const priceText = firstNonEmpty([ normalizeInlineText(candidate.price_text), @@ -62,7 +91,7 @@ function normalizeSearchCandidate( normalizeInlineText(extractMoqText(candidate.hover_price_text)), normalizeInlineText(extractMoqText(containerText)), ])); - const sellerUrl = cleanText(candidate.seller_url); + const canonicalSellerUrl = canonicalizeSellerUrl(cleanText(candidate.seller_url)); const evidenceText = uniqueNonEmpty([ containerText, ...(candidate.desc_rows ?? []), @@ -72,31 +101,33 @@ function normalizeSearchCandidate( const badges = extractBadges(evidenceText, [...FACTORY_BADGE_PATTERNS, ...SERVICE_BADGE_PATTERNS]); const salesText = firstNonEmpty([ extractSalesText(candidate.sales_text), - extractSalesText(containerText) ?? '', - ]) || null; + extractSalesText(containerText), + ]); + const returnRateText = extractReturnRateText([...(candidate.tag_items ?? []), ...(candidate.hover_items ?? 
[])]); + const provenance = buildProvenance(sourceUrl); return { - rank, - offer_id: extractOfferId(itemUrl), - member_id: extractMemberId(sellerUrl), - shop_id: extractShopId(sellerUrl), - title: cleanText(candidate.title) || firstLine(containerText), - source_url: sourceUrl, - fetched_at: new Date().toISOString(), - strategy: 'cookie', - price_text: priceRange.price_text, + rank: 0, + offer_id: extractOfferId(canonicalItemUrl ?? '') ?? null, + member_id: extractMemberId(canonicalSellerUrl ?? '') ?? null, + shop_id: extractShopId(canonicalSellerUrl ?? '') ?? null, + title: cleanText(candidate.title) || firstLine(containerText) || null, + item_url: canonicalItemUrl, + seller_name: cleanText(candidate.seller_name) || null, + seller_url: canonicalSellerUrl, + price_text: priceRange.price_text || null, price_min: priceRange.price_min, price_max: priceRange.price_max, - currency: priceRange.currency ?? 'CNY', - moq_text: moq.moq_text, + currency: priceRange.currency, + moq_text: moq.moq_text || null, moq_value: moq.moq_value, - seller_name: cleanText(candidate.seller_name) || null, - seller_url: sellerUrl || null, - item_url: itemUrl, location: extractLocation(containerText), badges, - sales_text: salesText, - return_rate_text: extractReturnRateText(candidate.tag_items ?? []), + sales_text: salesText || null, + return_rate_text: returnRateText, + source_url: provenance.source_url, + fetched_at: provenance.fetched_at, + strategy: provenance.strategy, }; } @@ -113,14 +144,14 @@ function extractPriceText(text: string | null | undefined): string { return normalized.match(/[¥$€]\s*\d+(?:\.\d+)?/)?.[0] ?? 
''; } -function extractSalesText(text: string | null | undefined): string | null { +function extractSalesText(text: string | null | undefined): string { const normalized = normalizeInlineText(text); - if (!normalized) return null; + if (!normalized) return ''; if (/^\d+(?:\.\d+)?\+?\s*(件|套|个|单)$/.test(normalized)) { return normalized; } const match = normalized.match(/(?:已售|销量|售)\s*\d+(?:\.\d+)?\+?\s*(件|套|个|单)?/); - return match ? cleanText(match[0]) : null; + return match ? cleanText(match[0]) : ''; } function firstLine(text: string): string { @@ -145,15 +176,29 @@ function extractReturnRateText(values: string[]): string | null { ?? null; } -async function readSearchPayload(page: IPage, query: string): Promise { - const url = buildSearchUrl(query); +function buildDedupeKey(row: Pick): string | null { + if (row.offer_id) return `offer:${row.offer_id}`; + if (row.item_url) return `url:${row.item_url}`; + return null; +} + +async function readSearchPayload(page: IPage, url: string): Promise { const state = await gotoAndReadState(page, url, 2500, 'search'); - assertNotCaptcha(state, 'search'); + assertAuthenticatedState(state, 'search'); - return await page.evaluate(` + const payload = await page.evaluate(` (() => { const normalizeText = (value) => (value || '').replace(/\\s+/g, ' ').trim(); - const isItemHref = (href) => ${JSON.stringify(SEARCH_ITEM_URL_PATTERNS)}.some((pattern) => (href || '').includes(pattern)); + const normalizeUrl = (href) => { + if (!href) return ''; + try { + return new URL(href, window.location.href).toString(); + } catch { + return ''; + } + }; + const isItemHref = (href) => ${JSON.stringify(SEARCH_ITEM_URL_PATTERNS)} + .some((pattern) => (href || '').includes(pattern)); const uniqueTexts = (values) => [...new Set(values.map((value) => normalizeText(value)).filter(Boolean))]; const collectTexts = (root, selector) => uniqueTexts( Array.from(root.querySelectorAll(selector)).map((node) => node.innerText || node.textContent || ''), @@ -178,7 
+223,14 @@ async function readSearchPayload(page: IPage, query: string): Promise { + let node = anchor; + while (node && node !== document.body) { + const text = normalizeText(node.innerText || node.textContent || ''); + if (text.length >= 40 && text.length <= 2000) { + return node; + } + node = node.parentElement; + } + return anchor; + }; const collectCandidates = () => { const anchors = Array.from(document.querySelectorAll('a')).filter((anchor) => isItemHref(anchor.href || '')); const seen = new Set(); const items = []; - - const pickContainer = (anchor) => { - let node = anchor; - while (node && node !== document.body) { - const text = normalizeText(node.innerText || node.textContent || ''); - if (text.length >= 40 && text.length <= 2000) { - return node; - } - node = node.parentElement; - } - return anchor; - }; - for (const anchor of anchors) { const href = anchor.href || ''; if (!href || seen.has(href)) continue; @@ -234,18 +284,85 @@ async function readSearchPayload(page: IPage, query: string): Promise { + const selectors = [ + 'a.fui-next:not(.disabled)', + 'a.next-pagination-item:not(.disabled)', + 'a[rel="next"]:not(.disabled)', + 'a[data-role="next"]:not(.disabled)', + ]; + for (const selector of selectors) { + const node = document.querySelector(selector); + if (!node) continue; + const href = normalizeUrl(node.getAttribute('href') || node.href || ''); + if (href) return href; + } + const textBased = Array.from(document.querySelectorAll('a')) + .find((node) => /下一页|next/i.test(normalizeText(node.textContent || ''))); + if (!textBased) return ''; + return normalizeUrl(textBased.getAttribute('href') || textBased.href || ''); + }; return { href: window.location.href, title: document.title || '', bodyText: document.body ? 
document.body.innerText || '' : '', + next_url: findNextUrl(), candidates: collectCandidates(), }; })() `) as SearchPayload; + + if (!payload || typeof payload !== 'object') { + throw new CommandExecutionError( + '1688 search page did not return a readable payload', + 'Open the same query in Chrome and verify the page is fully loaded before retrying.', + ); + } + + return payload; +} + +async function collectSearchRows(page: IPage, query: string, limit: number): Promise { + const rowsByKey = new Map(); + const seenPages = new Set(); + let nextUrl = buildSearchUrl(query); + let pageCount = 0; + + while (nextUrl && rowsByKey.size < limit && pageCount < MAX_SEARCH_PAGES) { + if (seenPages.has(nextUrl)) break; + seenPages.add(nextUrl); + pageCount += 1; + + const payload = await readSearchPayload(page, nextUrl); + const sourceUrl = cleanText(payload.href) || nextUrl; + const candidates = Array.isArray(payload.candidates) ? payload.candidates : []; + + for (const candidate of candidates) { + const row = normalizeSearchCandidate(candidate, sourceUrl); + const dedupeKey = buildDedupeKey(row); + if (!dedupeKey || rowsByKey.has(dedupeKey)) continue; + rowsByKey.set(dedupeKey, row); + if (rowsByKey.size >= limit) break; + } + + const candidateNextUrl = cleanText(payload.next_url); + if (!candidateNextUrl || candidateNextUrl === sourceUrl) break; + nextUrl = candidateNextUrl; + } + + if (rowsByKey.size === 0) { + throw new EmptyResultError( + '1688 search', + 'No visible results were extracted. 
Retry with a different query or open the same search page in Chrome first.', + ); + } + + return [...rowsByKey.values()] + .slice(0, limit) + .map((row, index) => ({ ...row, rank: index + 1 })); } cli({ @@ -265,32 +382,15 @@ cli({ { name: 'limit', type: 'int', - default: 20, - help: '结果数量上限(默认 20)', + default: SEARCH_LIMIT_DEFAULT, + help: `结果数量上限(默认 ${SEARCH_LIMIT_DEFAULT},最大 ${SEARCH_LIMIT_MAX})`, }, ], columns: ['rank', 'title', 'price_text', 'moq_text', 'seller_name', 'location'], func: async (page, kwargs) => { const query = String(kwargs.query ?? ''); - const limit = Math.max(1, Number(kwargs.limit) || 20); - const payload = await readSearchPayload(page, query); - const sourceUrl = cleanText(payload.href) || buildSearchUrl(query); - const candidates = limitCandidates(payload.candidates ?? [], limit) - .filter((candidate) => cleanText(candidate.item_url)); - - if (candidates.length === 0) { - throw new CommandExecutionError( - '1688 search did not expose any result cards', - 'The search page likely hit a slider challenge or changed its DOM. 
Open the same query in Chrome, solve any challenge, keep a clean 1688 tab selected, and retry.', - ); - } - - const provenance = buildProvenance(sourceUrl); - return candidates.map((candidate, index) => ({ - ...normalizeSearchCandidate(candidate, index + 1, sourceUrl), - fetched_at: provenance.fetched_at, - strategy: provenance.strategy, - })); + const limit = parseSearchLimit(kwargs.limit); + return collectSearchRows(page, query, limit); }, }); @@ -299,4 +399,5 @@ export const __test__ = { extractMoqText, extractSalesText, firstLine, + buildDedupeKey, }; diff --git a/src/clis/1688/shared.test.ts b/src/clis/1688/shared.test.ts index 3395fc1d..e0cfdbbd 100644 --- a/src/clis/1688/shared.test.ts +++ b/src/clis/1688/shared.test.ts @@ -2,13 +2,18 @@ import { describe, expect, it } from 'vitest'; import { __test__ } from './shared.js'; describe('1688 shared helpers', () => { - it('builds encoded search URLs', () => { + it('builds encoded search URLs and validates limit', () => { expect(__test__.buildSearchUrl('置物架')).toBe( 'https://s.1688.com/selloffer/offer_search.htm?charset=utf8&keywords=%E7%BD%AE%E7%89%A9%E6%9E%B6', ); + expect(() => __test__.buildSearchUrl(' ')).toThrowError(/cannot be empty/i); + + expect(__test__.parseSearchLimit(3)).toBe(3); + expect(__test__.parseSearchLimit('1000')).toBe(__test__.SEARCH_LIMIT_MAX); + expect(() => __test__.parseSearchLimit('0')).toThrowError(/positive integer/i); }); - it('extracts stable ids from 1688 inputs', () => { + it('extracts IDs and canonicalizes urls', () => { expect(__test__.extractOfferId('887904326744')).toBe('887904326744'); expect(__test__.extractOfferId('https://detail.1688.com/offer/887904326744.html')).toBe('887904326744'); expect(__test__.extractMemberId('https://winport.m.1688.com/page/index.html?memberId=b2b-1641351767')).toBe('b2b-1641351767'); @@ -16,6 +21,15 @@ describe('1688 shared helpers', () => { expect(__test__.resolveStoreUrl('b2b-22154705262941f196')).toBe( 
'https://winport.m.1688.com/page/index.html?memberId=b2b-22154705262941f196', ); + expect(__test__.canonicalizeStoreUrl('https://yinuoweierfushi.1688.com/page/index.html?spm=foo')).toBe( + 'https://yinuoweierfushi.1688.com', + ); + expect(__test__.canonicalizeItemUrl('http://detail.m.1688.com/page/index.html?offerId=910933345396&spm=x')).toBe( + 'https://detail.1688.com/offer/910933345396.html', + ); + expect(__test__.canonicalizeSellerUrl('https://yinuoweierfushi.1688.com/page/contactinfo.html?tracelog=1')).toBe( + 'https://yinuoweierfushi.1688.com', + ); expect(__test__.extractShopId('https://yinuoweierfushi.1688.com/page/index.html')).toBe('yinuoweierfushi'); }); @@ -45,15 +59,17 @@ describe('1688 shared helpers', () => { }); }); - it('extracts location and captcha states', () => { + it('detects captcha and login states', () => { expect(__test__.extractLocation('山东青岛 送至 江苏苏州')).toBe('山东青岛'); - expect(__test__.extractMetric(`主营:家装建材 -地址:江苏省常州市武进区横林镇崔桥崔卫路40号`, '主营')).toBe('家装建材'); - expect(__test__.extractMetric('常州市优品诺家居科技有限公司是家居用品、家居用品等产品专业生产加工的公司', '生产加工')).toBe(null); expect(__test__.isCaptchaState({ href: 'https://s.1688.com/_____tmd_____/punish', title: '验证码拦截', body_text: '请拖动下方滑块完成验证', })).toBe(true); + expect(__test__.isLoginState({ + href: 'https://login.taobao.com/member/login.jhtml', + title: '账号登录', + body_text: '请登录后继续', + })).toBe(true); }); }); diff --git a/src/clis/1688/shared.ts b/src/clis/1688/shared.ts index b4a904bc..6fcfb77f 100644 --- a/src/clis/1688/shared.ts +++ b/src/clis/1688/shared.ts @@ -7,7 +7,22 @@ export const SEARCH_URL_PREFIX = 'https://s.1688.com/selloffer/offer_search.htm? 
export const DETAIL_URL_PREFIX = 'https://detail.1688.com/offer/'; export const STORE_MOBILE_URL_PREFIX = 'https://winport.m.1688.com/page/index.html?memberId='; export const STRATEGY = 'cookie'; - +export const SEARCH_LIMIT_DEFAULT = 20; +export const SEARCH_LIMIT_MAX = 100; + +const STORE_GENERIC_HOSTS = new Set(['www', 'detail', 's', 'winport', 'work', 'air', 'dj']); +const TRACKING_QUERY_KEYS = new Set([ + 'spm', + 'tracelog', + 'clickid', + 'source', + 'scene', + 'from', + 'src', + 'ns', + 'cna', + 'pvid', +]); const CAPTCHA_URL_MARKER = '/_____tmd_____/punish'; const CAPTCHA_TEXT_PATTERNS = [ '请拖动下方滑块完成验证', @@ -17,6 +32,18 @@ const CAPTCHA_TEXT_PATTERNS = [ '访问验证', '滑动验证', ]; +const LOGIN_TEXT_PATTERNS = [ + '请登录', + '登录后', + '账号登录', + '手机登录', + '立即登录', + '扫码登录', + '请先完成登录', + '请先登录后查看', +]; +const LOGIN_URL_PATTERNS = ['/member/login', 'passport', 'login.taobao.com', 'account.1688.com']; + export const FACTORY_BADGE_PATTERNS = [ '源头工厂', '深度验厂', @@ -41,6 +68,7 @@ export const SERVICE_BADGE_PATTERNS = [ '包邮', '闪电拿样', ]; + const CHINA_LOCATIONS = [ '北京', '天津', @@ -139,10 +167,24 @@ export function uniqueNonEmpty(values: Array): string return [...new Set(values.map((value) => cleanText(value)).filter(Boolean))]; } +export function parseSearchLimit(input: unknown): number { + const parsed = Number.parseInt(String(input ?? 
SEARCH_LIMIT_DEFAULT), 10); + if (!Number.isFinite(parsed) || parsed < 1) { + throw new ArgumentError( + '1688 search --limit must be a positive integer', + 'Example: opencli 1688 search "桌面置物架" --limit 20', + ); + } + return Math.min(SEARCH_LIMIT_MAX, parsed); +} + export function buildSearchUrl(query: string): string { const normalized = cleanText(query); if (!normalized) { - throw new ArgumentError('1688 search query cannot be empty'); + throw new ArgumentError( + '1688 search query cannot be empty', + 'Example: opencli 1688 search "桌面置物架" --limit 20', + ); } return `${SEARCH_URL_PREFIX}${encodeURIComponent(normalized)}`; } @@ -161,11 +203,10 @@ export function buildDetailUrl(input: string): string { export function resolveStoreUrl(input: string): string { const normalized = cleanText(input); if (!normalized) { - throw new ArgumentError('1688 store expects a store URL, shop host, or member ID'); - } - - if (/^https?:\/\//i.test(normalized)) { - return canonicalizeStoreUrl(normalized); + throw new ArgumentError( + '1688 store expects a store URL or member ID', + 'Example: opencli 1688 store https://yinuoweierfushi.1688.com/', + ); } const memberId = extractMemberId(normalized); @@ -173,6 +214,10 @@ export function resolveStoreUrl(input: string): string { return `${STORE_MOBILE_URL_PREFIX}${memberId}`; } + if (/^https?:\/\//i.test(normalized)) { + return canonicalizeStoreUrl(normalized); + } + if (normalized.endsWith('.1688.com')) { return canonicalizeStoreUrl(`https://${normalized}`); } @@ -182,21 +227,50 @@ export function resolveStoreUrl(input: string): string { } throw new ArgumentError( - '1688 store expects a store URL, shop host, or member ID', - 'Example: opencli 1688 store https://yinuoweierfushi.1688.com/?offerId=887904326744', + '1688 store expects a store URL or member ID', + 'Example: opencli 1688 store b2b-22154705262941f196', ); } export function canonicalizeStoreUrl(input: string): string { - try { - const url = new URL(input); - if 
(!url.hostname.endsWith('1688.com')) { - throw new Error('not-1688'); - } - return url.toString(); - } catch { - throw new ArgumentError('Invalid 1688 store URL'); + const url = parse1688Url(input); + const memberId = extractMemberId(url.toString()); + if (memberId) { + return `${STORE_MOBILE_URL_PREFIX}${memberId}`; + } + + const host = normalizeStoreHost(url.hostname); + if (!host) { + throw new ArgumentError( + 'Invalid 1688 store URL', + 'Example: opencli 1688 store https://yinuoweierfushi.1688.com/', + ); + } + return `https://${host}`; +} + +export function canonicalizeItemUrl(input: string): string | null { + const offerId = extractOfferId(input); + if (offerId) { + return `${DETAIL_URL_PREFIX}${offerId}.html`; + } + const url = parse1688UrlOrNull(input); + if (!url) return null; + stripTrackingParams(url); + url.hash = ''; + return url.toString(); +} + +export function canonicalizeSellerUrl(input: string): string | null { + const memberId = extractMemberId(input); + if (memberId) { + return `${STORE_MOBILE_URL_PREFIX}${memberId}`; } + const url = parse1688UrlOrNull(input); + if (!url) return null; + const host = normalizeStoreHost(url.hostname); + if (!host) return null; + return `https://${host}`; } export function extractOfferId(input: string): string | null { @@ -226,13 +300,12 @@ export function extractMemberId(input: string): string | null { export function extractShopId(input: string): string | null { const normalized = cleanText(input); if (!normalized) return null; + try { const url = new URL(/^https?:\/\//i.test(normalized) ? normalized : `https://${normalized}`); - const [subdomain] = url.hostname.split('.'); - if (!subdomain || ['www', 'detail', 's', 'winport', 'work'].includes(subdomain)) { - return null; - } - return subdomain; + const host = normalizeStoreHost(url.hostname); + if (!host) return null; + return host.split('.')[0] ?? null; } catch { return /^[a-z0-9-]+$/i.test(normalized) ? 
normalized : null; } @@ -280,9 +353,7 @@ export function normalizePriceTiers( const priceText = cleanText(tier.price); const price = toNumber(tier.price); return { - quantity_text: quantityMin !== null - ? `${quantityMin}${unit ?? ''}` - : '', + quantity_text: quantityMin !== null ? `${quantityMin}${unit ?? ''}` : '', quantity_min: quantityMin, price_text: priceText, price, @@ -359,9 +430,7 @@ export function extractMainBusiness(text: string): string | null { } export function extractBadges(text: string, candidates: string[]): string[] { - return uniqueNonEmpty( - candidates.filter((candidate) => cleanMultilineText(text).includes(candidate)), - ); + return uniqueNonEmpty(candidates.filter((candidate) => cleanMultilineText(text).includes(candidate))); } export function guessTopCategories(text: string): string[] { @@ -378,6 +447,14 @@ export function isCaptchaState(state: Partial): boolean { return CAPTCHA_TEXT_PATTERNS.some((pattern) => title.includes(pattern) || bodyText.includes(pattern)); } +export function isLoginState(state: Partial): boolean { + const href = cleanText(state.href).toLowerCase(); + const title = cleanText(state.title); + const bodyText = cleanMultilineText(state.body_text); + if (LOGIN_URL_PATTERNS.some((pattern) => href.includes(pattern))) return true; + return LOGIN_TEXT_PATTERNS.some((pattern) => title.includes(pattern) || bodyText.includes(pattern)); +} + export function buildCaptchaHint(action: string): string { return [ `Open a clean 1688 ${action} page in the shared Chrome profile and finish any slider challenge first.`, @@ -428,48 +505,17 @@ export async function gotoAndReadState( } export async function ensure1688Session(page: IPage): Promise { - const state = await gotoAndReadState(page, HOME_URL, 1500); - if (isCaptchaState(state)) { - throw new CommandExecutionError( - '1688 homepage is currently blocked by a slider challenge', - buildCaptchaHint('homepage'), - ); - } - - const authState = await page.evaluate(` - (() => { - const 
text = document.body ? document.body.innerText || '' : ''; - const hasSearchInput = !!document.querySelector('input#alisearch-input, input[name="keywords"]'); - const hasLoggedMarker = ['采购车', '收藏的品', '我的足迹', '全部订单'] - .some((label) => text.includes(label)); - const hasLoginPrompt = ['请登录', '立即登录', '登录后'] - .some((label) => text.includes(label)); - return { - hasSearchInput, - hasLoggedMarker, - hasLoginPrompt, - }; - })() - `) as { hasSearchInput?: boolean; hasLoggedMarker?: boolean; hasLoginPrompt?: boolean }; - - const isLoggedIn = authState.hasSearchInput === true - && authState.hasLoggedMarker === true - && authState.hasLoginPrompt !== true; + const state = await gotoAndReadState(page, HOME_URL, 1500, 'homepage'); + assertAuthenticatedState(state, 'homepage'); +} - if (!isLoggedIn) { - throw new AuthRequiredError( - '1688.com', - '1688 is not logged in in the shared Chrome profile', - ); - } +export function assertAuthenticatedState(state: PageState, action: string): void { + if (!isCaptchaState(state) && !isLoginState(state)) return; + throw new AuthRequiredError('1688.com', `请先在共享 Chrome 完成 1688 登录/验证,再重试(${action})`); } export function assertNotCaptcha(state: PageState, action: string): void { - if (!isCaptchaState(state)) return; - throw new CommandExecutionError( - `1688 ${action} hit a slider challenge`, - buildCaptchaHint(action), - ); + assertAuthenticatedState(state, action); } export function toNumber(value: unknown): number | null { @@ -502,10 +548,59 @@ function escapeForRegex(value: string): string { return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); } +function parse1688Url(input: string): URL { + const normalized = cleanText(input); + try { + const url = new URL(normalized); + if (!url.hostname.endsWith('.1688.com') && url.hostname !== '1688.com' && url.hostname !== 'www.1688.com') { + throw new Error('invalid-host'); + } + stripTrackingParams(url); + url.hash = ''; + return url; + } catch { + throw new ArgumentError( + 'Invalid 1688 URL', + 
'Use a URL under 1688.com (for example: https://detail.1688.com/offer/887904326744.html)', + ); + } +} + +function parse1688UrlOrNull(input: string): URL | null { + try { + return parse1688Url(input); + } catch { + return null; + } +} + +function normalizeStoreHost(hostname: string): string | null { + const lower = cleanText(hostname).toLowerCase(); + if (!lower.endsWith('.1688.com')) return null; + const [subdomain] = lower.split('.'); + if (!subdomain || STORE_GENERIC_HOSTS.has(subdomain)) return null; + return lower; +} + +function stripTrackingParams(url: URL): void { + const keys = [...url.searchParams.keys()]; + for (const key of keys) { + if (TRACKING_QUERY_KEYS.has(key) || key.toLowerCase().startsWith('utm_')) { + url.searchParams.delete(key); + } + } +} + export const __test__ = { + SEARCH_LIMIT_DEFAULT, + SEARCH_LIMIT_MAX, + parseSearchLimit, buildSearchUrl, buildDetailUrl, resolveStoreUrl, + canonicalizeStoreUrl, + canonicalizeItemUrl, + canonicalizeSellerUrl, extractOfferId, extractMemberId, extractShopId, @@ -520,6 +615,7 @@ export const __test__ = { extractBadges, guessTopCategories, isCaptchaState, + isLoginState, cleanText, cleanMultilineText, uniqueNonEmpty, diff --git a/src/clis/1688/store.test.ts b/src/clis/1688/store.test.ts index a005a65e..6da43135 100644 --- a/src/clis/1688/store.test.ts +++ b/src/clis/1688/store.test.ts @@ -36,7 +36,7 @@ describe('1688 store normalization', () => { seller: { companyName: '青岛沁澜衣品服装有限公司', memberId: 'b2b-1641351767', - winportUrl: 'https://yinuoweierfushi.1688.com', + winportUrl: 'https://yinuoweierfushi.1688.com/page/index.html?spm=abc', }, services: [{ serviceName: '延期必赔' }, { serviceName: '品质保障' }], }, @@ -47,39 +47,23 @@ describe('1688 store normalization', () => { expect(result.company_url).toBe('https://yinuoweierfushi.1688.com/page/contactinfo.html'); expect(result.years_on_platform_text).toBe('入驻13年'); expect(result.location).toBe('山东省青岛市即墨区环秀街道办事处湘江二路97号甲'); - 
expect(result.return_rate_text).toBe('87%'); + expect(result.return_rate_text).toContain('87%'); expect(result.top_categories).toEqual(['大码女装']); expect(result.service_badges).toEqual(['延期必赔', '品质保障']); }); it('builds contact urls and extracts offer ids', () => { + expect(__test__.safeCanonicalStoreUrl('https://yinuoweierfushi.1688.com/page/index.html?spm=foo')).toBe( + 'https://yinuoweierfushi.1688.com', + ); expect(__test__.buildContactUrl('https://yinuoweierfushi.1688.com')).toBe( 'https://yinuoweierfushi.1688.com/page/contactinfo.html', ); expect(__test__.firstOfferId([ 'https://detail.1688.com/offer/887904326744.html', ])).toBe('887904326744'); - }); - - it('collects deduplicated offer ids from input and store links', () => { - expect(__test__.collectOfferIds( - 'https://detail.1688.com/offer/887904326744.html', - { - href: 'https://yinuoweierfushi.1688.com/page/index.html', - bodyText: '', - offerLinks: [ - 'https://detail.1688.com/offer/887904326744.html', - 'https://detail.1688.com/offer/123456789012.html', - ], - }, - { - href: 'https://yinuoweierfushi.1688.com/page/contactinfo.html', - bodyText: '', - offerLinks: [ - 'https://detail.1688.com/offer/123456789012.html', - 'https://detail.1688.com/offer/999999999999.html', - ], - }, - )).toEqual(['887904326744', '123456789012', '999999999999']); + expect(__test__.firstContactUrl([ + 'https://yinuoweierfushi.1688.com/page/contactinfo.html?spm=1', + ])).toBe('https://yinuoweierfushi.1688.com/page/contactinfo.html'); }); }); diff --git a/src/clis/1688/store.ts b/src/clis/1688/store.ts index da7024e4..ce21b637 100644 --- a/src/clis/1688/store.ts +++ b/src/clis/1688/store.ts @@ -1,18 +1,18 @@ -import { CommandExecutionError } from '../../errors.js'; +import { CommandExecutionError, EmptyResultError } from '../../errors.js'; import { cli, Strategy } from '../../registry.js'; import type { IPage } from '../../types.js'; import { FACTORY_BADGE_PATTERNS, SERVICE_BADGE_PATTERNS, - assertNotCaptcha, - buildCaptchaHint, 
+ assertAuthenticatedState, buildDetailUrl, buildProvenance, + canonicalizeSellerUrl, + canonicalizeStoreUrl, cleanMultilineText, cleanText, extractAddress, extractBadges, - extractMainBusiness, extractMemberId, extractMetric, extractOfferId, @@ -44,22 +44,6 @@ interface StoreItemSeed { services?: Array<{ serviceName?: string }>; } -function collectOfferIds( - rawInput: string, - storePayload: StoreBrowserPayload | null, - contactPayload: StoreBrowserPayload | null, -): string[] { - const ids = uniqueNonEmpty([ - rawInput, - ...(storePayload?.offerLinks ?? []), - ...(contactPayload?.offerLinks ?? []), - ]) - .map((value) => extractOfferId(value)) - .filter((value): value is string => Boolean(value)); - - return [...new Set(ids)]; -} - function normalizeStorePayload(input: { resolvedUrl: string; storePayload: StoreBrowserPayload | null; @@ -76,23 +60,25 @@ function normalizeStorePayload(input: { const seedText = cleanMultilineText(seed?.bodyText); const combinedText = [contactText, storeText, seedText].filter(Boolean).join('\n'); - const sellerUrl = cleanText( + const sellerUrlRaw = cleanText( seed?.seller?.winportUrl ?? seed?.seller?.sellerWinportUrlMap?.defaultUrl ?? storePayload?.href ?? input.resolvedUrl, ); + const storeUrl = safeCanonicalStoreUrl(sellerUrlRaw || input.resolvedUrl) ?? input.resolvedUrl; + const sellerUrl = canonicalizeSellerUrl(sellerUrlRaw) ?? storeUrl; + const companyUrl = pickCompanyUrl(contactPayload?.href, storeUrl); const memberId = cleanText(seed?.seller?.memberId) || input.explicitMemberId || extractMemberId(input.resolvedUrl) + || extractMemberId(storePayload?.href ?? '') || null; - const shopId = extractShopId(sellerUrl) ?? extractShopId(input.resolvedUrl); + const shopId = extractShopId(sellerUrl) ?? 
extractShopId(storeUrl); const companyName = cleanText(seed?.seller?.companyName) || firstNamedLine(contactText) || firstNamedLine(storeText) || null; - const storeUrl = canonicalStoreUrl(sellerUrl || input.resolvedUrl); - const companyUrl = buildContactUrl(storeUrl) ?? storeUrl; const serviceBadges = uniqueNonEmpty([ ...extractBadges(combinedText, SERVICE_BADGE_PATTERNS), ...((seed?.services ?? []).map((service) => cleanText(service.serviceName))), @@ -106,7 +92,6 @@ function normalizeStorePayload(input: { store_url: storeUrl, company_name: companyName, company_url: companyUrl, - ...buildProvenance(contactPayload?.href || storePayload?.href || input.resolvedUrl), business_model_text: firstMetric(combinedText, ['经营模式', '生产加工', '主营产品']), years_on_platform_text: extractYearsOnPlatform(combinedText), location: extractAddress(contactText) ?? extractAddress(storeText), @@ -118,21 +103,31 @@ function normalizeStorePayload(input: { top_categories: guessTopCategories(combinedText), phone_text: extractMetric(contactText, '电话'), mobile_text: extractMetric(contactText, '手机'), + ...buildProvenance(cleanText(contactPayload?.href) || cleanText(storePayload?.href) || input.resolvedUrl), }; } -function canonicalStoreUrl(url: string): string { +function safeCanonicalStoreUrl(url: string): string | null { try { - const parsed = new URL(url); - return `${parsed.protocol}//${parsed.hostname}`; + return canonicalizeStoreUrl(url); } catch { - return url; + return null; } } +function pickCompanyUrl(contactHref: string | undefined, storeUrl: string): string | null { + const fromPage = cleanText(contactHref); + if (fromPage) { + const normalized = buildContactUrl(fromPage); + if (normalized) return normalized; + } + return buildContactUrl(storeUrl); +} + function buildContactUrl(storeUrl: string): string | null { try { const parsed = new URL(storeUrl); + if (!parsed.hostname.endsWith('.1688.com')) return null; return `${parsed.protocol}//${parsed.hostname}/page/contactinfo.html`; } catch { 
return null; @@ -157,9 +152,10 @@ function firstMetric(text: string, labels: string[]): string | null { function extractReturnRate(text: string): string | null { const inline = text.match(/回头率\s*([0-9.]+%)/); - if (inline) return inline[1]; - const multiline = text.match(/回头率\n([0-9.]+%)/); - return multiline ? multiline[1] : null; + if (inline) return cleanText(inline[0]); + const multiline = text.match(/回头率\s*\n\s*([0-9.]+%)/); + if (!multiline) return null; + return `回头率${cleanText(multiline[1])}`; } function firstOfferId(links: string[]): string | null { @@ -170,20 +166,24 @@ function firstOfferId(links: string[]): string | null { return null; } -async function readStorePayload( - page: IPage, - url: string, - action: string, -): Promise { +function firstContactUrl(links: string[]): string | null { + for (const link of links) { + const url = buildContactUrl(link); + if (url) return url; + } + return null; +} + +async function readStorePayload(page: IPage, url: string, action: string): Promise { const state = await gotoAndReadState(page, url, 2500, action); - assertNotCaptcha(state, action); + assertAuthenticatedState(state, action); return await page.evaluate(` (() => ({ href: window.location.href, title: document.title || '', bodyText: document.body ? 
document.body.innerText || '' : '', - offerLinks: Array.from(document.querySelectorAll('a[href*="detail.1688.com/offer/"]')) + offerLinks: Array.from(document.querySelectorAll('a[href*="detail.1688.com/offer/"], a[href*="offerId="]')) .map((anchor) => anchor.href) .filter(Boolean), contactLinks: Array.from(document.querySelectorAll('a[href*="contactinfo"]')) @@ -193,13 +193,10 @@ async function readStorePayload( `) as StoreBrowserPayload; } -async function readItemSeed( - page: IPage, - offerId: string, -): Promise { +async function readItemSeed(page: IPage, offerId: string): Promise { const itemUrl = buildDetailUrl(offerId); const state = await gotoAndReadState(page, itemUrl, 2500, 'store seed item'); - assertNotCaptcha(state, 'store seed item'); + assertAuthenticatedState(state, 'store seed item'); const seed = await page.evaluate(` (() => { @@ -214,28 +211,25 @@ async function readItemSeed( })() `) as StoreItemSeed; - if (!cleanText(seed.href) || !seed.seller) { + const hasSellerContext = !!cleanText(seed?.seller?.memberId) || !!cleanText(seed?.seller?.winportUrl); + if (!hasSellerContext) { throw new CommandExecutionError( '1688 store seed item did not expose seller context', - `${buildCaptchaHint('item')} Open a real 1688 item page in Chrome and retry.`, + '当前 tab 非商品详情上下文,请切到 detail.1688.com 商品页并重试', ); } return seed; } -async function readFirstUsableItemSeed( - page: IPage, - offerIds: string[], -): Promise { - for (const offerId of offerIds.slice(0, 8)) { - try { - return await readItemSeed(page, offerId); - } catch (err) { - if (!(err instanceof CommandExecutionError)) throw err; - } - } - return null; +function hasAnyEvidence( + storePayload: StoreBrowserPayload | null, + contactPayload: StoreBrowserPayload | null, + seed: StoreItemSeed | null, +): boolean { + return !!cleanText(storePayload?.bodyText) + || !!cleanText(contactPayload?.bodyText) + || !!cleanText(seed?.bodyText); } cli({ @@ -250,22 +244,37 @@ cli({ name: 'input', required: true, positional: 
true, - help: '1688 店铺 URL、店铺 host 或 member ID(如 b2b-22154705262941f196)', + help: '1688 店铺 URL 或 member ID(如 b2b-22154705262941f196)', }, ], - columns: ['company_name', 'years_on_platform_text', 'location', 'return_rate_text'], + columns: ['store_name', 'years_on_platform_text', 'location', 'return_rate_text'], func: async (page, kwargs) => { const rawInput = String(kwargs.input ?? ''); const resolvedUrl = resolveStoreUrl(rawInput); const explicitMemberId = extractMemberId(rawInput); const storePayload = await readStorePayload(page, resolvedUrl, 'store'); - const contactUrl = buildContactUrl(storePayload.href || resolvedUrl); + const contactUrl = firstContactUrl(storePayload.contactLinks ?? []) || buildContactUrl(storePayload.href || resolvedUrl); const contactPayload = contactUrl ? await readStorePayload(page, contactUrl, 'store contact') : null; - const seed = await readFirstUsableItemSeed( - page, - collectOfferIds(rawInput, storePayload, contactPayload), - ); + const offerId = extractOfferId(rawInput) + || firstOfferId(storePayload.offerLinks ?? []) + || firstOfferId(contactPayload?.offerLinks ?? []); + + let seed: StoreItemSeed | null = null; + if (offerId) { + try { + seed = await readItemSeed(page, offerId); + } catch (error) { + if (!(error instanceof CommandExecutionError)) throw error; + } + } + + if (!hasAnyEvidence(storePayload, contactPayload, seed)) { + throw new EmptyResultError( + '1688 store', + 'Store page is reachable but no visible fields were extracted. Open the store page in Chrome and retry.', + ); + } return [ normalizeStorePayload({ @@ -281,11 +290,11 @@ cli({ export const __test__ = { normalizeStorePayload, - canonicalStoreUrl, + safeCanonicalStoreUrl, buildContactUrl, firstNamedLine, firstMetric, extractReturnRate, firstOfferId, - collectOfferIds, + firstContactUrl, };