diff --git a/README.md b/README.md index b4492f2a..c6cab43e 100644 --- a/README.md +++ b/README.md @@ -138,6 +138,7 @@ git clone git@github.com:jackwener/opencli.git && cd opencli && npm install && n | **twitter** | `trending` `search` `timeline` `bookmarks` `post` `download` `profile` `article` `like` `likes` `notifications` `reply` `reply-dm` `thread` `follow` `unfollow` `followers` `following` `block` `unblock` `bookmark` `unbookmark` `delete` `hide-reply` `accept` | | **reddit** | `hot` `frontpage` `popular` `search` `subreddit` `user` `user-posts` `user-comments` `read` `save` `saved` `subscribe` `upvote` `upvoted` `comment` | | **amazon** | `bestsellers` `search` `product` `offer` `discussion` | +| **1688** | `search` `item` `store` | | **gemini** | `new` `ask` `image` | | **notebooklm** | `status` `list` `open` `select` `current` `get` `metadata` `source-list` `source-get` `source-fulltext` `source-guide` `history` `note-list` `notes-list` `notes-get` `summary` | | **spotify** | `auth` `status` `play` `pause` `next` `prev` `volume` `search` `queue` `shuffle` `repeat` | diff --git a/README.zh-CN.md b/README.zh-CN.md index 6f5d7d3d..e7a46e03 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -191,6 +191,7 @@ npx skills add jackwener/opencli --skill opencli-oneshot # 快速命令参 | **facebook** | `feed` `profile` `search` `friends` `groups` `events` `notifications` `memories` `add-friend` `join-group` | 浏览器 | | **google** | `news` `search` `suggest` `trends` | 公开 | | **amazon** | `bestsellers` `search` `product` `offer` `discussion` | 浏览器 | +| **1688** | `search` `item` `store` | 浏览器 | | **gemini** | `new` `ask` `image` | 浏览器 | | **spotify** | `auth` `status` `play` `pause` `next` `prev` `volume` `search` `queue` `shuffle` `repeat` | OAuth API | | **notebooklm** | `status` `list` `open` `select` `current` `get` `metadata` `source-list` `source-get` `source-fulltext` `source-guide` `history` `note-list` `notes-list` `notes-get` `summary` | 浏览器 | diff --git 
a/docs/.vitepress/config.mts b/docs/.vitepress/config.mts index 45da8876..7f68f8ea 100644 --- a/docs/.vitepress/config.mts +++ b/docs/.vitepress/config.mts @@ -73,6 +73,7 @@ export default defineConfig({ { text: 'Chaoxing', link: '/adapters/browser/chaoxing' }, { text: 'Grok', link: '/adapters/browser/grok' }, { text: 'Amazon', link: '/adapters/browser/amazon' }, + { text: '1688', link: '/adapters/browser/1688' }, { text: 'Gemini', link: '/adapters/browser/gemini' }, { text: 'NotebookLM', link: '/adapters/browser/notebooklm' }, { text: 'WeRead', link: '/adapters/browser/weread' }, diff --git a/docs/adapters/browser/1688.md b/docs/adapters/browser/1688.md new file mode 100644 index 00000000..ba82ec9e --- /dev/null +++ b/docs/adapters/browser/1688.md @@ -0,0 +1,52 @@ +# 1688 + +**Mode**: 🔐 Browser · **Domain**: `1688.com` + +## Commands + +| Command | Description | +|---------|-------------| +| `opencli 1688 search "<query>" --limit <n>` | Search public product candidates with price, MOQ, seller link, and visible badges | +| `opencli 1688 item <url-or-offer-id>` | Read a public product detail page with price tiers, MOQ, delivery text, and seller basics | +| `opencli 1688 store <url-or-member-id>` | Read a public supplier/store page with company info, years on platform, categories, and visible service signals | + +## Usage Examples + +```bash +# Search products +opencli 1688 search "桌面置物架 宿舍 收纳" --limit 10 + +# JSON output +opencli 1688 search "桌面置物架 宿舍 收纳" --limit 10 -f json + +# Read an item by offer id +opencli 1688 item 841141931191 -f json + +# Read an item by URL +opencli 1688 item https://detail.1688.com/offer/841141931191.html -f json + +# Read a supplier store +opencli 1688 store https://shop52908bfw19166.1688.com/ -f json + +# Read a supplier by member id +opencli 1688 store b2b-22154705262941f196 -f json +``` + +## Prerequisites + +- Chrome running and **logged into** `1688.com` +- [Browser Bridge extension](/guide/browser-bridge) installed + +## Notes + +- This adapter only returns fields visible on 
public pages. It does not send inquiries, place orders, or access seller back office data. +- Prefer stable identifiers such as `offer_id`, `member_id`, and `shop_id` for follow-up workflows. +- `search --limit` defaults to `20` and is capped at `100`. +- `search` deduplicates with key priority: `offer_id` first, then canonical `item_url`. +- `item` can be more sensitive to the active browser target than `search` or `store`. + +## Troubleshooting + +- If `opencli 1688 item` reports `did not expose product context`, first make sure the open page is a real `detail.1688.com` item page. +- If the browser target is too broad, retry with `OPENCLI_CDP_TARGET=detail.1688.com`. +- If you hit a slider or verification page, refresh the real page in Chrome and retry. diff --git a/docs/adapters/index.md b/docs/adapters/index.md index 5b5b4cc2..4e7c1611 100644 --- a/docs/adapters/index.md +++ b/docs/adapters/index.md @@ -45,6 +45,7 @@ Run `opencli list` for the live registry. | **[google](./browser/google)** | `news` `search` `suggest` `trends` | 🌐 / 🔐 | | **[jd](./browser/jd)** | `item` | 🔐 Browser | | **[amazon](./browser/amazon)** | `bestsellers` `search` `product` `offer` `discussion` | 🔐 Browser | +| **[1688](./browser/1688)** | `search` `item` `store` | 🔐 Browser | | **[web](./browser/web)** | `read` | 🔐 Browser | | **[weixin](./browser/weixin)** | `download` | 🔐 Browser | | **[36kr](./browser/36kr)** | `news` `hot` `search` `article` | 🌐 / 🔐 | diff --git a/docs/developer/testing.md b/docs/developer/testing.md index 730b6f30..f6712d57 100644 --- a/docs/developer/testing.md +++ b/docs/developer/testing.md @@ -131,6 +131,8 @@ npx vitest src/ - `browser-public.test.ts` 使用 `tryBrowserCommand()`,站点反爬或地域限制导致空数据时会 warn + pass - `browser-auth.test.ts` 验证 **graceful failure**,重点是不 crash、不 hang、错误信息可控 - 如需测试完整登录态,保持 Chrome 登录态并安装 Browser Bridge 扩展,再手动运行对应测试 +- 对依赖具体 host 页面上下文的 browser adapter,除了单测外,还应手动验证真实命令,并把必要的 target host 约束写进 adapter docs / troubleshooting +- 对会主动导航页面的 
browser commands,手动验证时优先串行执行;多个 CLI 进程同时连到同一个 CDP target 可能互相覆盖导航,制造假的 adapter 故障 --- diff --git a/docs/guide/troubleshooting.md b/docs/guide/troubleshooting.md index 4b352c9f..9cb7b528 100644 --- a/docs/guide/troubleshooting.md +++ b/docs/guide/troubleshooting.md @@ -12,6 +12,17 @@ - Your login session in Chrome might have expired. Open a normal Chrome tab, navigate to the target site, and log in or refresh the page. - Some sites have geographic restrictions (e.g., Bilibili, Zhihu from outside China). +### Browser command opens the page but still cannot read context + +- A healthy Browser Bridge connection does not guarantee that the current page target exposes the data your adapter expects. +- Some browser adapters are sensitive to the active host or page context. +- Example: `opencli 1688 item` may fail with `did not expose product context` if the target is too broad. +- Retry on a real item page, refresh the page in Chrome, and if needed narrow the target, for example: + +```bash +OPENCLI_CDP_TARGET=detail.1688.com opencli 1688 item 841141931191 -f json +``` + ### Node API errors - Make sure you are using **Node.js >= 20**. Some dependencies require modern Node APIs. 
// Spec for the item normalizer: one representative payload goes through
// `normalizeItemPayload` and the flattened contract fields are checked.
describe('1688 item normalization', () => {
  it('normalizes public item payload into contract fields', () => {
    const result = __test__.normalizeItemPayload({
      href: 'https://detail.1688.com/offer/887904326744.html',
      title: '法式春季长袖开衫连衣裙女新款大码女装碎花吊带裙套装142077 - 阿里巴巴',
      // Visible page text; the MOQ, location and private-label expectations below
      // match lines of this text.
      bodyText: `
        青岛沁澜衣品服装有限公司
        入驻13年
        主营:大码女装
        店铺回头率
        87%
        山东青岛
        3套起批
        已售1600+套
        支持定制logo
      `,
      offerTitle: '法式春季长袖开衫连衣裙女新款大码女装碎花吊带裙套装142077',
      // Numeric on purpose: the normalizer is expected to return it as a string.
      offerId: 887904326744,
      seller: {
        companyName: '青岛沁澜衣品服装有限公司',
        memberId: 'b2b-1641351767',
        // Path and query are expected to be dropped from `seller_url`.
        winportUrl: 'https://yinuoweierfushi.1688.com/page/index.html?spm=a1',
      },
      trade: {
        beginAmount: 3,
        priceDisplay: '96.00-98.00',
        unit: '套',
        saleCount: 1655,
        offerIDatacenterSellInfo: {
          面料名称: '莫代尔',
          主面料成分: '莫代尔纤维',
          // `sellPointModel` must not show up in `visible_attributes`.
          sellPointModel: '{"ignore":true}',
        },
        offerPriceModel: {
          currentPrices: [
            { beginAmount: 3, price: '98.00' },
            { beginAmount: 50, price: '97.00' },
          ],
        },
      },
      gallery: {
        mainImage: ['https://example.com/1.jpg'],
        offerImgList: ['https://example.com/2.jpg'],
        wlImageInfos: [{ fullPathImageURI: 'https://example.com/3.jpg' }],
      },
      // No `shipping` block and no "发货" text in bodyText, so the delivery text has to
      // come from `agreeDeliveryHours` on a service entry.
      services: [
        { serviceName: '延期必赔', agreeDeliveryHours: 360 },
        { serviceName: '品质保障' },
      ],
    });

    expect(result.offer_id).toBe('887904326744');
    expect(result.member_id).toBe('b2b-1641351767');
    expect(result.shop_id).toBe('yinuoweierfushi');
    expect(result.seller_url).toBe('https://yinuoweierfushi.1688.com');
    expect(result.price_text).toBe('¥96.00-98.00');
    expect(result.moq_text).toBe('3套起批');
    expect(result.origin_place).toBe('山东青岛');
    expect(result.delivery_days_text).toBe('360小时内发货');
    expect(result.private_label_text).toBe('支持定制logo');
    expect(result.visible_attributes).toEqual([
      { key: '面料名称', value: '莫代尔' },
      { key: '主面料成分', value: '莫代尔纤维' },
    ]);
  });
});
/**
 * One service / buyer-protection entry as read from the item page data.
 * All fields are optional because the entries are copied straight from page JSON.
 */
interface BuyerProtectionModel {
  serviceName?: string;
  shortBuyerDesc?: string;
  packageBuyerDesc?: string;
  textDesc?: string;
  agreeDeliveryHours?: number;
}

/**
 * Raw snapshot produced by the in-page script of `readItemPayload`.
 *
 * NOTE(review): the type arguments of the two `Record` members were not visible in the
 * reviewed copy; `Record<string, string>` and `Record<string, unknown>` are inferred
 * from how the fields are consumed below — confirm against the original declaration.
 */
interface ItemBrowserPayload {
  href?: string;
  title?: string;
  bodyText?: string;
  offerTitle?: string;
  offerId?: string | number;
  seller?: {
    companyName?: string;
    memberId?: string;
    winportUrl?: string;
    sellerWinportUrlMap?: Record<string, string>;
  };
  trade?: {
    beginAmount?: string | number;
    priceDisplay?: string;
    unit?: string;
    saleCount?: string | number;
    offerIDatacenterSellInfo?: Record<string, unknown>;
    offerPriceModel?: {
      currentPrices?: Array<{ beginAmount?: string | number; price?: string | number }>;
    };
  };
  gallery?: {
    mainImage?: string[];
    offerImgList?: string[];
    wlImageInfos?: Array<{ fullPathImageURI?: string }>;
  };
  shipping?: {
    deliveryLimitText?: string;
    logisticsText?: string;
    protectionInfos?: BuyerProtectionModel[];
    buyerProtectionModel?: BuyerProtectionModel[];
  };
  services?: BuyerProtectionModel[];
}

/** A single key/value attribute row shown on the item page. */
interface VisibleAttribute {
  key: string;
  value: string;
}

/**
 * Flattens a raw item-page snapshot into the adapter's output record.
 *
 * Structured page data is preferred where present; the visible body text is the
 * fallback for price and the only source for location, sales, stock and the
 * customization / private-label lines.
 *
 * @param payload Snapshot captured from the item page.
 * @returns A flat record. Text fields that could not be resolved are `null`
 *          (including `item_url`, which previously leaked `''` when neither an offer id
 *          nor a page URL was available).
 */
function normalizeItemPayload(payload: ItemBrowserPayload): Record<string, unknown> {
  const href = cleanText(payload.href);
  const bodyText = cleanMultilineText(payload.bodyText);

  // Seller: storefront URL comes from winportUrl first, then the URL map entries.
  const sellerName = cleanText(payload.seller?.companyName);
  const sellerUrlRaw = cleanText(
    payload.seller?.winportUrl
      ?? payload.seller?.sellerWinportUrlMap?.defaultUrl
      ?? payload.seller?.sellerWinportUrlMap?.indexUrl,
  );
  const sellerUrl = canonicalizeSellerUrl(sellerUrlRaw);

  // Identifiers: explicit values win; URLs are the fallback source.
  const offerId = cleanText(String(payload.offerId ?? '')) || extractOfferId(href) || null;
  const memberId = cleanText(payload.seller?.memberId) || extractMemberId(sellerUrlRaw || href) || null;
  const shopId = extractShopId(sellerUrl ?? href);

  // Price / MOQ: the structured price display gets a "¥" prefix before parsing.
  const unit = cleanText(payload.trade?.unit);
  const priceDisplay = cleanText(payload.trade?.priceDisplay);
  const priceRange = parsePriceText(priceDisplay ? `¥${priceDisplay}` : bodyText);
  const moqText = extractMoqText(bodyText, payload.trade?.beginAmount, unit);
  const moq = parseMoqText(moqText);

  const services = uniqueServices(payload);
  const serviceBadges = uniqueNonEmpty(services.map((service) => cleanText(service.serviceName)));
  const attributes = normalizeVisibleAttributes(payload.trade?.offerIDatacenterSellInfo);
  const priceTiers = normalizePriceTiers(payload.trade?.offerPriceModel?.currentPrices ?? [], unit || null);
  const images = uniqueNonEmpty([
    ...(payload.gallery?.mainImage ?? []),
    ...(payload.gallery?.offerImgList ?? []),
    ...((payload.gallery?.wlImageInfos ?? []).map((item) => item.fullPathImageURI ?? '')),
  ]);

  // Rebuild the detail URL from the offer id when one is known; otherwise keep the page URL.
  const detailUrl = offerId ? buildDetailUrl(offerId) : href;
  const provenance = buildProvenance(href || detailUrl);

  return {
    offer_id: offerId,
    member_id: memberId,
    shop_id: shopId,
    title: cleanText(payload.offerTitle) || stripAlibabaSuffix(payload.title) || firstNonEmptyLine(bodyText) || null,
    // Fix: empty string is normalised to null like every other unresolved text field.
    item_url: detailUrl || null,
    main_images: images,
    price_text: priceRange.price_text || null,
    price_tiers: priceTiers,
    currency: priceRange.currency,
    moq_text: moq.moq_text || null,
    moq_value: moq.moq_value,
    seller_name: sellerName || null,
    seller_url: sellerUrl,
    shop_name: sellerName || null,
    origin_place: extractLocation(bodyText),
    delivery_days_text: extractDeliveryDaysText(bodyText, services, payload.shipping),
    customization_text: extractKeywordLine(bodyText, ['来样定制', '来图定制', '支持定制', '可定制', '定制']),
    private_label_text: extractKeywordLine(bodyText, ['贴牌', '贴标', '定制logo', '打logo', 'OEM', 'ODM']),
    visible_attributes: attributes,
    sales_text: extractSalesText(bodyText),
    service_badges: serviceBadges,
    stock_quantity: extractStockQuantity(bodyText),
    ...provenance,
  };
}
/**
 * Converts the raw attribute map of an item into ordered key/value rows.
 *
 * The `sellPointModel` entry is skipped, as are entries whose key or value is empty
 * after cleaning. `null` / `undefined` values and nested plain objects are skipped too:
 * stringifying them would emit the literal values "null" / "[object Object]".
 *
 * @param raw Attribute map from the page; anything that is not a record yields `[]`.
 * @returns Attribute rows in the map's own key order.
 */
function normalizeVisibleAttributes(raw: unknown): VisibleAttribute[] {
  if (!isRecord(raw)) return [];

  const attributes: VisibleAttribute[] = [];
  for (const [rawKey, rawValue] of Object.entries(raw)) {
    if (rawKey === 'sellPointModel') continue;
    if (rawValue == null) continue;
    // Arrays keep their previous comma-joined rendering; plain objects have no useful text form.
    if (typeof rawValue === 'object' && !Array.isArray(rawValue)) continue;

    const key = cleanText(rawKey);
    const value = cleanText(String(rawValue));
    if (key && value) attributes.push({ key, value });
  }
  return attributes;
}

/**
 * Merges the service entries found in `payload.services`,
 * `payload.shipping.protectionInfos` and `payload.shipping.buyerProtectionModel`
 * (in that order), keeping the first entry seen for each cleaned `serviceName`.
 *
 * Entries without a service name are dropped. Null holes in the arrays, which can
 * occur because the data is page JSON, are skipped instead of raising a TypeError.
 *
 * @param payload Snapshot captured from the item page.
 * @returns De-duplicated service entries in first-seen order.
 */
function uniqueServices(payload: ItemBrowserPayload): BuyerProtectionModel[] {
  const combined: Array<BuyerProtectionModel | null | undefined> = [
    ...(Array.isArray(payload.services) ? payload.services : []),
    ...(Array.isArray(payload.shipping?.protectionInfos) ? payload.shipping.protectionInfos : []),
    ...(Array.isArray(payload.shipping?.buyerProtectionModel) ? payload.shipping.buyerProtectionModel : []),
  ];

  const seen = new Set<string>();
  const result: BuyerProtectionModel[] = [];
  for (const service of combined) {
    if (!service || typeof service !== 'object') continue;
    const key = cleanText(service.serviceName);
    if (!key || seen.has(key)) continue;
    seen.add(key);
    result.push(service);
  }
  return result;
}
/** Removes a trailing " - 阿里巴巴" marker from a cleaned page title. */
function stripAlibabaSuffix(title: string | undefined): string {
  const cleaned = cleanText(title);
  return cleaned.replace(/\s*-\s*阿里巴巴$/, '').trim();
}

/** Returns the first line of `text` that is non-empty after cleaning, or `''`. */
function firstNonEmptyLine(text: string): string {
  for (const rawLine of text.split('\n')) {
    const line = cleanText(rawLine);
    if (line) return line;
  }
  return '';
}

/**
 * Resolves the minimum-order text for an item.
 *
 * A visible "<n><unit>起批" phrase in the page text wins; otherwise the phrase is
 * rebuilt from the structured begin amount and unit. Returns `''` when neither exists.
 */
function extractMoqText(bodyText: string, beginAmount: string | number | undefined, unit: string): string {
  const visible = /\d+(?:\.\d+)?\s*(件|个|套|箱|包|双|台|把|只)\s*起批/.exec(bodyText);
  if (visible) return visible[0];

  const moqValue = toNumber(beginAmount);
  return moqValue !== null ? `${moqValue}${unit || ''}起批` : '';
}

/**
 * Resolves the delivery-time text, trying in order: the shipping block's own text
 * fields, a "<n>小时/天(内)发货" phrase in the page text, then the first service entry
 * carrying a numeric `agreeDeliveryHours`.
 */
function extractDeliveryDaysText(
  bodyText: string,
  services: BuyerProtectionModel[],
  shipping: ItemBrowserPayload['shipping'],
): string | null {
  const fromShipping = cleanText(shipping?.deliveryLimitText) || cleanText(shipping?.logisticsText);
  if (fromShipping) return fromShipping;

  const visible = /\d+\s*(?:小时|天)(?:内)?发货/.exec(bodyText);
  if (visible) return visible[0];

  for (const service of services) {
    if (typeof service.agreeDeliveryHours === 'number') {
      return `${service.agreeDeliveryHours}小时内发货`;
    }
  }
  return null;
}

/** Returns the first non-empty cleaned line containing any of `keywords`, or `null`. */
function extractKeywordLine(bodyText: string, keywords: string[]): string | null {
  const hit = bodyText
    .split('\n')
    .map((line) => cleanText(line))
    .filter(Boolean)
    .find((line) => keywords.some((keyword) => line.includes(keyword)));
  return hit ?? null;
}

/** Picks a "全网销量…" / "已售…" sales phrase out of the page text, or `null`. */
function extractSalesText(bodyText: string): string | null {
  const found = /(?:全网销量|已售)\s*\d+(?:\.\d+)?\+?[件套个]?/.exec(bodyText);
  if (!found) return null;
  return cleanText(found[0]);
}
/**
 * Reads the stock count that follows a "库存" label in the page text.
 *
 * Accepts an optional ASCII or full-width colon after the label and thousands
 * separators inside the number, so "库存1,234" yields 1234 (it used to stop at the
 * comma and yield 1) and "库存:88" yields 88 (it used to be missed).
 *
 * @param bodyText Visible page text.
 * @returns The stock count, or `null` when no "库存 <number>" phrase is present.
 */
function extractStockQuantity(bodyText: string): number | null {
  // First alternative: properly grouped digits (1,234 / 12,345,678); second: plain digits.
  const match = /库存\s*[::]?\s*(\d{1,3}(?:[,,]\d{3})+|\d+)/.exec(bodyText);
  const digits = match?.[1];
  if (!digits) return null;
  return Number.parseInt(digits.replace(/[,,]/g, ''), 10);
}
/**
 * Navigates to an item page and captures the raw snapshot used by `normalizeItemPayload`.
 *
 * The in-page script reads `window.context` (specifically
 * `result.global.globalData.model` and `result.data`) and round-trips the pieces
 * through JSON so only plain data is returned.
 *
 * @param page Browser page handle.
 * @param itemUrl Item URL to open.
 * @returns The raw snapshot.
 * @throws CommandExecutionError when the page yields no usable snapshot or no offer id
 *         can be resolved from either the page data or the final URL. Whatever
 *         `gotoAndReadState` / `assertAuthenticatedState` throw propagates unchanged.
 */
async function readItemPayload(page: IPage, itemUrl: string): Promise<ItemBrowserPayload> {
  const state = await gotoAndReadState(page, itemUrl, 2500, 'item');
  assertAuthenticatedState(state, 'item');

  const payload = await page.evaluate(`
    (() => {
      const root = window.context ?? {};
      const model = root.result?.global?.globalData?.model ?? null;
      const toJson = (value) => JSON.parse(JSON.stringify(value ?? null));
      return {
        href: window.location.href,
        title: document.title || '',
        bodyText: document.body ? document.body.innerText || '' : '',
        offerTitle: model?.offerTitleModel?.subject ?? '',
        offerId: model?.tradeModel?.offerId ?? '',
        seller: toJson(model?.sellerModel),
        trade: toJson(model?.tradeModel),
        gallery: toJson(root.result?.data?.gallery?.fields ?? null),
        shipping: toJson(root.result?.data?.shippingServices?.fields ?? null),
        services: toJson(root.result?.data?.shippingServices?.fields?.protectionInfos ?? []),
      };
    })()
  `) as ItemBrowserPayload | null | undefined;

  // Fix: a null / non-object evaluate result used to crash with a TypeError on
  // `payload.offerId`; it is now reported through the same documented error.
  const hasPayload = Boolean(payload) && typeof payload === 'object';
  const resolvedOfferId = hasPayload && payload
    ? cleanText(String(payload.offerId ?? '')) || extractOfferId(cleanText(payload.href))
    : '';
  if (!payload || !hasPayload || !resolvedOfferId) {
    throw new CommandExecutionError(
      '1688 item page did not expose product context',
      '当前 tab 非商品详情上下文,请切到 detail.1688.com 商品页并重试',
    );
  }

  return payload;
}

// Registers `opencli 1688 item <input>`; the positional input is a URL or an offer id
// and is turned into a detail URL by `buildDetailUrl` before the page is opened.
cli({
  site: '1688',
  name: 'item',
  description: '1688 商品详情(公开商品字段、价格阶梯、卖家基础信息)',
  domain: 'www.1688.com',
  strategy: Strategy.COOKIE,
  navigateBefore: false,
  args: [
    {
      name: 'input',
      required: true,
      positional: true,
      help: '1688 商品 URL 或 offer ID(如 887904326744)',
    },
  ],
  columns: ['offer_id', 'title', 'price_text', 'moq_text', 'seller_name', 'origin_place'],
  func: async (page, kwargs) => {
    const itemUrl = buildDetailUrl(String(kwargs.input ?? ''));
    const payload = await readItemPayload(page, itemUrl);
    return [normalizeItemPayload(payload)];
  },
});

// Internal helpers exposed for unit tests only.
export const __test__ = {
  normalizeItemPayload,
  normalizeVisibleAttributes,
  stripAlibabaSuffix,
  extractMoqText,
  extractDeliveryDaysText,
  extractKeywordLine,
  extractSalesText,
  extractStockQuantity,
};
// Specs for the search normalizer and the dedupe-key helper.
describe('1688 search normalization', () => {
  it('normalizes search candidates into structured result rows', () => {
    const result = __test__.normalizeSearchCandidate({
      item_url: 'https://detail.1688.com/offer/887904326744.html',
      title: '宿舍置物架桌面加高架',
      container_text: '宿舍置物架桌面加高架 ¥56.00 2套起批 山东青岛 已售300+套',
      // Spaced the way the price fragments appear in the DOM; expected back as "¥56.00".
      price_text: '¥ 56 .00',
      sales_text: '300+套',
      moq_text: '2套起批',
      tag_items: ['退货包运费', '回头率52%'],
      hover_items: ['验厂报告'],
      seller_name: '青岛沁澜衣品服装有限公司',
      seller_url: 'https://yinuoweierfushi.1688.com/page/index.html?spm=a123',
    }, 'https://s.1688.com/selloffer/offer_search.htm?charset=utf8&keywords=置物架');

    // The normalizer itself always emits rank 0; ranks are assigned later by the collector.
    expect(result.rank).toBe(0);
    expect(result.offer_id).toBe('887904326744');
    expect(result.shop_id).toBe('yinuoweierfushi');
    expect(result.item_url).toBe('https://detail.1688.com/offer/887904326744.html');
    expect(result.seller_url).toBe('https://yinuoweierfushi.1688.com');
    expect(result.price_text).toBe('¥56.00');
    expect(result.price_min).toBe(56);
    expect(result.price_max).toBe(56);
    expect(result.moq_value).toBe(2);
    expect(result.location).toBe('山东青岛');
    expect(result.sales_text).toBe('300+套');
    expect(result.badges).toEqual(expect.arrayContaining(['退货包运费', '验厂报告']));
    expect(result.return_rate_text).toBe('回头率52%');
  });

  it('extracts offer id from mobile detail search links', () => {
    const result = __test__.normalizeSearchCandidate({
      // Mobile detail link: the offer id lives in the query string.
      item_url: 'http://detail.m.1688.com/page/index.html?offerId=910933345396&sortType=&pageId=',
      // Empty title: the row title has to be derived from the container text.
      title: '',
      container_text: '桌面书桌办公室工位收纳展示新中式博古架多层茶具厨房摆放置物架 ¥24.3 已售20+件',
      // Dedicated fields disagree with the container text on purpose; they must win.
      price_text: '¥ 14 .28',
      sales_text: '1500+件',
      moq_text: '≥2个',
      seller_name: '泰商国际贸易(宁阳)有限公司',
      seller_url: 'http://tsgjmy.1688.com/',
    }, 'https://s.1688.com/selloffer/offer_search.htm?charset=utf8&keywords=桌面置物架');

    expect(result.offer_id).toBe('910933345396');
    expect(result.shop_id).toBe('tsgjmy');
    expect(result.item_url).toBe('https://detail.1688.com/offer/910933345396.html');
    expect(result.title).toContain('桌面书桌办公室工位收纳展示');
    expect(result.price_text).toBe('¥14.28');
    expect(result.sales_text).toBe('1500+件');
    expect(result.moq_text).toBe('≥2个');
    expect(result.moq_value).toBe(2);
  });

  it('prefers offer id and falls back to item url for dedupe key', () => {
    expect(__test__.buildDedupeKey({
      offer_id: '123456',
      item_url: 'https://detail.1688.com/offer/123456.html',
    })).toBe('offer:123456');
    expect(__test__.buildDedupeKey({
      offer_id: null,
      item_url: 'https://detail.1688.com/offer/123456.html',
    })).toBe('url:https://detail.1688.com/offer/123456.html');
    // Nothing to key on: such a row cannot be deduplicated.
    expect(__test__.buildDedupeKey({ offer_id: null, item_url: null })).toBeNull();
  });
});
/** Raw snapshot returned by the in-page script of `readSearchPayload`. */
interface SearchPayload {
  href?: string;
  title?: string;
  bodyText?: string;
  /** Absolute URL of the next result page, or empty when none was found. */
  next_url?: string;
  /** One entry per distinct item link found on the page. */
  candidates?: Array<{
    item_url?: string;
    title?: string;
    /** Normalized text of the DOM container chosen for the item link. */
    container_text?: string;
    desc_rows?: string[];
    price_text?: string | null;
    sales_text?: string | null;
    hover_price_text?: string | null;
    moq_text?: string | null;
    tag_items?: string[];
    hover_items?: string[];
    seller_name?: string | null;
    seller_url?: string | null;
  }>;
}

/** One normalized search result as returned by the `search` command. */
interface SearchRow {
  /** 0 straight out of `normalizeSearchCandidate`; 1-based after `collectSearchRows`. */
  rank: number;
  offer_id: string | null;
  member_id: string | null;
  shop_id: string | null;
  title: string | null;
  item_url: string | null;
  seller_name: string | null;
  seller_url: string | null;
  price_text: string | null;
  price_min: number | null;
  price_max: number | null;
  currency: string | null;
  moq_text: string | null;
  moq_value: number | null;
  location: string | null;
  badges: string[];
  sales_text: string | null;
  return_rate_text: string | null;
  source_url: string;
  fetched_at: string;
  strategy: string;
}

// Substrings that mark an anchor href as an item link (desktop and mobile detail pages).
const SEARCH_ITEM_URL_PATTERNS = [
  'detail.1688.com/offer/',
  'detail.m.1688.com/page/index.html?offerId=',
];
// Upper bound on result pages visited by one `collectSearchRows` call.
const MAX_SEARCH_PAGES = 12;
/**
 * Converts one raw search candidate into a `SearchRow`.
 *
 * Dedicated DOM fields take priority; the hover price text and then the whole
 * container text are the fallbacks for price and MOQ. `rank` is always 0 here.
 *
 * @param candidate Raw candidate captured from the result page.
 * @param sourceUrl URL of the result page the candidate was read from.
 */
function normalizeSearchCandidate(
  candidate: NonNullable<SearchPayload['candidates']>[number],
  sourceUrl: string,
): SearchRow {
  const itemUrl = canonicalizeItemUrl(cleanText(candidate.item_url));
  const cardText = cleanText(candidate.container_text);
  const hoverPrice = candidate.hover_price_text;
  const tagItems = candidate.tag_items ?? [];
  const hoverItems = candidate.hover_items ?? [];

  // Price: explicit price cell, then a price found in the hover text, then the card text.
  const priceCandidates = [
    normalizeInlineText(candidate.price_text),
    normalizeInlineText(extractPriceText(hoverPrice)),
  ];
  const price = parsePriceText(firstNonEmpty(priceCandidates) || cardText);

  // MOQ: explicit field, then hover text, then the card text.
  const moqCandidates = [
    normalizeInlineText(candidate.moq_text),
    normalizeInlineText(extractMoqText(hoverPrice)),
    normalizeInlineText(extractMoqText(cardText)),
  ];
  const moq = parseMoqText(firstNonEmpty(moqCandidates));

  const sellerUrl = canonicalizeSellerUrl(cleanText(candidate.seller_url));

  // Badges are matched against every piece of visible text collected for the card.
  const evidence = uniqueNonEmpty([
    cardText,
    ...(candidate.desc_rows ?? []),
    ...tagItems,
    ...hoverItems,
  ]).join('\n');
  const badges = extractBadges(evidence, [...FACTORY_BADGE_PATTERNS, ...SERVICE_BADGE_PATTERNS]);

  const sales = firstNonEmpty([
    extractSalesText(candidate.sales_text),
    extractSalesText(cardText),
  ]);
  const returnRate = extractReturnRateText([...tagItems, ...hoverItems]);
  const provenance = buildProvenance(sourceUrl);

  const row: SearchRow = {
    rank: 0,
    offer_id: extractOfferId(itemUrl ?? '') ?? null,
    member_id: extractMemberId(sellerUrl ?? '') ?? null,
    shop_id: extractShopId(sellerUrl ?? '') ?? null,
    title: cleanText(candidate.title) || firstLine(cardText) || null,
    item_url: itemUrl,
    seller_name: cleanText(candidate.seller_name) || null,
    seller_url: sellerUrl,
    price_text: price.price_text || null,
    price_min: price.price_min,
    price_max: price.price_max,
    currency: price.currency,
    moq_text: moq.moq_text || null,
    moq_value: moq.moq_value,
    location: extractLocation(cardText),
    badges,
    sales_text: sales || null,
    return_rate_text: returnRate,
    source_url: provenance.source_url,
    fetched_at: provenance.fetched_at,
    strategy: provenance.strategy,
  };
  return row;
}
/**
 * Finds a minimum-order phrase in `text`. Three shapes are tried in order:
 * "<n><unit>起批", "≥<n>[unit]", and a "<n>~<m><unit>" style range. Returns `''` if none match.
 */
function extractMoqText(text: string | null | undefined): string {
  const normalized = normalizeInlineText(text);
  const shapes = [
    /\d+(?:\.\d+)?\s*(件|个|套|箱|包|双|台|把|只)\s*起批/i,
    /≥\s*\d+(?:\.\d+)?\s*(件|个|套|箱|包|双|台|把|只)?/i,
    /\d+(?:\.\d+)?\s*(?:~|-|至|到)\s*\d+(?:\.\d+)?\s*(件|个|套|箱|包|双|台|把|只)/i,
  ];
  for (const shape of shapes) {
    const hit = shape.exec(normalized);
    if (hit) return hit[0];
  }
  return '';
}

/** Returns the first currency-prefixed amount (¥, $ or €) in `text`, or `''`. */
function extractPriceText(text: string | null | undefined): string {
  const hit = /[¥$€]\s*\d+(?:\.\d+)?/.exec(normalizeInlineText(text));
  return hit ? hit[0] : '';
}

/**
 * Extracts a sales phrase. A value that is nothing but "<n>[+]<unit>" is returned as is;
 * otherwise a "已售…" / "销量…" / "售…" phrase is searched for. Returns `''` when absent.
 */
function extractSalesText(text: string | null | undefined): string {
  const normalized = normalizeInlineText(text);
  if (!normalized) return '';

  const isBareCount = /^\d+(?:\.\d+)?\+?\s*(件|套|个|单)$/.test(normalized);
  if (isBareCount) return normalized;

  const labelled = /(?:已售|销量|售)\s*\d+(?:\.\d+)?\+?\s*(件|套|个|单)?/.exec(normalized);
  if (!labelled) return '';
  return cleanText(labelled[0]);
}

/**
 * Returns the first whitespace-delimited token of `text`, or `''`.
 * Despite the name, this splits on any whitespace, not on line breaks.
 */
function firstLine(text: string): string {
  for (const token of text.split(/\s+/)) {
    if (token) return token;
  }
  return '';
}

/** Returns the first entry that is non-empty after cleaning, or `''`. */
function firstNonEmpty(values: Array<string | null | undefined>): string {
  for (const value of values) {
    const cleaned = cleanText(value);
    if (cleaned) return cleaned;
  }
  return '';
}

/**
 * Cleans `text` and removes the spacing that split DOM fragments leave behind:
 * between a currency sign and its digits, around a decimal point, and around "~" / "-".
 */
function normalizeInlineText(text: string | null | undefined): string {
  const cleaned = cleanText(text);
  const currencyJoined = cleaned.replace(/([¥$€])\s+(?=\d)/g, '$1');
  const decimalJoined = currencyJoined.replace(/(\d)\s*\.\s*(\d)/g, '$1.$2');
  const rangeJoined = decimalJoined.replace(/\s*([~-])\s*/g, '$1');
  return rangeJoined.trim();
}

/** Returns the first value that is exactly a "回头率<n>%" label, or `null`. */
function extractReturnRateText(values: string[]): string | null {
  const labels = uniqueNonEmpty(values.map((value) => normalizeInlineText(value)));
  for (const label of labels) {
    if (/^回头率\s*\d+(?:\.\d+)?%$/.test(label)) return label;
  }
  return null;
}
/**
 * Builds the key used to de-duplicate search rows: `offer:<id>` when an offer id is
 * present, otherwise `url:<item_url>`, otherwise `null` (the row cannot be keyed).
 *
 * NOTE(review): the parameter's generic arguments were not visible in the reviewed
 * copy; presumably `Pick<SearchRow, 'offer_id' | 'item_url'>` — the inline shape below
 * lists the two fields the body reads; confirm.
 */
function buildDedupeKey(row: { offer_id: string | null; item_url: string | null }): string | null {
  const { offer_id: offerId, item_url: itemUrl } = row;
  if (offerId) {
    return `offer:${offerId}`;
  }
  return itemUrl ? `url:${itemUrl}` : null;
}
/**
 * Opens a search result page and captures its raw candidates.
 *
 * The in-page script (kept comment-free because its text is what gets evaluated):
 *  - treats an anchor as an item link when its href contains one of
 *    `SEARCH_ITEM_URL_PATTERNS`, and keeps only the first anchor per distinct href;
 *  - for each link, walks up the ancestors until one has 40–2000 characters of
 *    normalized text and uses that node as the card container (falling back to the
 *    anchor itself);
 *  - reads title, price, sales, tag and hover texts from the container via fixed
 *    CSS selectors, and takes the MOQ text from the hover items or hover price text;
 *  - picks as seller link the first anchor in the container whose host ends with
 *    `.1688.com` and is not one of the listed non-shop hosts;
 *  - looks for a next-page link through a list of selectors, then through anchor
 *    text matching "下一页" / "next".
 *
 * @param page Browser page handle.
 * @param url Search result URL to open.
 * @returns The raw page snapshot.
 * @throws CommandExecutionError when the evaluated result is not an object. Whatever
 *         `gotoAndReadState` / `assertAuthenticatedState` throw propagates unchanged.
 */
async function readSearchPayload(page: IPage, url: string): Promise<SearchPayload> {
  const state = await gotoAndReadState(page, url, 2500, 'search');
  assertAuthenticatedState(state, 'search');

  // Backslashes are doubled because the script is a template literal; the item URL
  // patterns are injected as a JSON array literal.
  const payload = await page.evaluate(`
    (() => {
      const normalizeText = (value) => (value || '').replace(/\\s+/g, ' ').trim();
      const normalizeUrl = (href) => {
        if (!href) return '';
        try {
          return new URL(href, window.location.href).toString();
        } catch {
          return '';
        }
      };
      const isItemHref = (href) => ${JSON.stringify(SEARCH_ITEM_URL_PATTERNS)}
        .some((pattern) => (href || '').includes(pattern));
      const uniqueTexts = (values) => [...new Set(values.map((value) => normalizeText(value)).filter(Boolean))];
      const collectTexts = (root, selector) => uniqueTexts(
        Array.from(root.querySelectorAll(selector)).map((node) => node.innerText || node.textContent || ''),
      );
      const firstText = (root, selectors) => {
        for (const selector of selectors) {
          const node = root.querySelector(selector);
          const value = normalizeText(node ? node.innerText || node.textContent || '' : '');
          if (value) return value;
        }
        return '';
      };
      const findMoqText = (values, priceText) => {
        const moqPattern = /(≥\\s*\\d+(?:\\.\\d+)?\\s*(件|个|套|箱|包|双|台|把|只)?)|(\\d+(?:\\.\\d+)?\\s*(?:~|-|至|到)\\s*\\d+(?:\\.\\d+)?\\s*(件|个|套|箱|包|双|台|把|只))|(\\d+(?:\\.\\d+)?\\s*(件|个|套|箱|包|双|台|把|只)\\s*起批)/i;
        return values.find((value) => moqPattern.test(value))
          || normalizeText(priceText).match(moqPattern)?.[0]
          || '';
      };
      const isSellerHref = (href) => {
        if (!href) return false;
        try {
          const url = new URL(href, window.location.href);
          const host = url.hostname || '';
          if (!host.endsWith('.1688.com')) return false;
          if (
            host === 's.1688.com'
            || host === 'r.1688.com'
            || host === 'air.1688.com'
            || host === 'detail.1688.com'
            || host === 'detail.m.1688.com'
            || host === 'dj.1688.com'
          ) {
            return false;
          }
          return true;
        } catch {
          return false;
        }
      };
      const pickContainer = (anchor) => {
        let node = anchor;
        while (node && node !== document.body) {
          const text = normalizeText(node.innerText || node.textContent || '');
          if (text.length >= 40 && text.length <= 2000) {
            return node;
          }
          node = node.parentElement;
        }
        return anchor;
      };
      const collectCandidates = () => {
        const anchors = Array.from(document.querySelectorAll('a')).filter((anchor) => isItemHref(anchor.href || ''));
        const seen = new Set();
        const items = [];
        for (const anchor of anchors) {
          const href = anchor.href || '';
          if (!href || seen.has(href)) continue;
          seen.add(href);

          const container = pickContainer(anchor);
          const tagItems = collectTexts(container, '.offer-tag-row .offer-desc-item');
          const hoverItems = collectTexts(container, '.offer-hover-wrapper .offer-desc-item');
          const sellerAnchor = Array.from(container.querySelectorAll('a'))
            .find((link) => isSellerHref(link.href || ''));
          const hoverPriceText = firstText(container, [
            '.offer-hover-wrapper .hover-price-item',
            '.offer-hover-wrapper .price-item',
          ]);

          items.push({
            item_url: href,
            title: firstText(container, ['.offer-title-row .title-text', '.offer-title-row'])
              || normalizeText(anchor.innerText || anchor.textContent || ''),
            container_text: normalizeText(container.innerText || container.textContent || ''),
            desc_rows: collectTexts(container, '.offer-desc-row'),
            price_text: firstText(container, ['.offer-price-row .price-item']),
            sales_text: firstText(container, ['.offer-price-row .col-desc_after', '.offer-desc-row .col-desc_after']),
            hover_price_text: hoverPriceText,
            moq_text: findMoqText(hoverItems, hoverPriceText),
            tag_items: tagItems,
            hover_items: hoverItems,
            seller_name: sellerAnchor ? normalizeText(sellerAnchor.innerText || sellerAnchor.textContent || '') : null,
            seller_url: sellerAnchor ? sellerAnchor.href : null,
          });
        }
        return items;
      };
      const findNextUrl = () => {
        const selectors = [
          'a.fui-next:not(.disabled)',
          'a.next-pagination-item:not(.disabled)',
          'a[rel="next"]:not(.disabled)',
          'a[data-role="next"]:not(.disabled)',
        ];
        for (const selector of selectors) {
          const node = document.querySelector(selector);
          if (!node) continue;
          const href = normalizeUrl(node.getAttribute('href') || node.href || '');
          if (href) return href;
        }
        const textBased = Array.from(document.querySelectorAll('a'))
          .find((node) => /下一页|next/i.test(normalizeText(node.textContent || '')));
        if (!textBased) return '';
        return normalizeUrl(textBased.getAttribute('href') || textBased.href || '');
      };

      return {
        href: window.location.href,
        title: document.title || '',
        bodyText: document.body ? document.body.innerText || '' : '',
        next_url: findNextUrl(),
        candidates: collectCandidates(),
      };
    })()
  `) as SearchPayload;

  // The cast above is unchecked, so verify at runtime that something object-like came back.
  if (!payload || typeof payload !== 'object') {
    throw new CommandExecutionError(
      '1688 search page did not return a readable payload',
      'Open the same query in Chrome and verify the page is fully loaded before retrying.',
    );
  }

  return payload;
}
/** Row shape produced by `normalizeSearchCandidate`, plus the 1-based rank assigned by `collectSearchRows`. */
type RankedSearchRow = ReturnType<typeof normalizeSearchCandidate> & { rank: number };

/**
 * Collect up to `limit` de-duplicated search rows for `query`.
 *
 * Starts at `buildSearchUrl(query)` and follows each payload's `next_url`.
 * Paging stops when enough rows were collected, when no further page is
 * advertised, when the next URL equals the current page or was already
 * visited, or after `MAX_SEARCH_PAGES` pages.
 *
 * @param page  Browser page used to load and read each result page.
 * @param query Raw search keywords.
 * @param limit Maximum number of rows to return.
 * @returns Rows in collection order, each with a 1-based `rank`.
 * @throws EmptyResultError when no row could be extracted from any page.
 */
async function collectSearchRows(page: IPage, query: string, limit: number): Promise<RankedSearchRow[]> {
  const rowsByKey = new Map<string, ReturnType<typeof normalizeSearchCandidate>>();
  const seenPages = new Set<string>();
  let nextUrl = buildSearchUrl(query);
  let pageCount = 0;

  while (nextUrl && rowsByKey.size < limit && pageCount < MAX_SEARCH_PAGES) {
    // Guard against pagination loops (a "next" link pointing back to a visited page).
    if (seenPages.has(nextUrl)) break;
    seenPages.add(nextUrl);
    pageCount += 1;

    const payload = await readSearchPayload(page, nextUrl);
    // Prefer the URL the browser actually ended up on; fall back to the requested one.
    const sourceUrl = cleanText(payload.href) || nextUrl;
    const candidates = Array.isArray(payload.candidates) ? payload.candidates : [];

    for (const candidate of candidates) {
      const row = normalizeSearchCandidate(candidate, sourceUrl);
      const dedupeKey = buildDedupeKey(row);
      // Rows without a usable key, or already seen, are skipped.
      if (!dedupeKey || rowsByKey.has(dedupeKey)) continue;
      rowsByKey.set(dedupeKey, row);
      if (rowsByKey.size >= limit) break;
    }

    const candidateNextUrl = cleanText(payload.next_url);
    if (!candidateNextUrl || candidateNextUrl === sourceUrl) break;
    nextUrl = candidateNextUrl;
  }

  if (rowsByKey.size === 0) {
    throw new EmptyResultError(
      '1688 search',
      'No visible results were extracted. Retry with a different query or open the same search page in Chrome first.',
    );
  }

  return [...rowsByKey.values()]
    .slice(0, limit)
    .map((row, index) => ({ ...row, rank: index + 1 }));
}
expect(__test__.extractOfferId('https://detail.1688.com/offer/887904326744.html')).toBe('887904326744'); + expect(__test__.extractMemberId('https://winport.m.1688.com/page/index.html?memberId=b2b-1641351767')).toBe('b2b-1641351767'); + expect(__test__.extractMemberId('b2b-22154705262941f196')).toBe('b2b-22154705262941f196'); + expect(__test__.resolveStoreUrl('b2b-22154705262941f196')).toBe( + 'https://winport.m.1688.com/page/index.html?memberId=b2b-22154705262941f196', + ); + expect(__test__.canonicalizeStoreUrl('https://yinuoweierfushi.1688.com/page/index.html?spm=foo')).toBe( + 'https://yinuoweierfushi.1688.com', + ); + expect(__test__.canonicalizeItemUrl('http://detail.m.1688.com/page/index.html?offerId=910933345396&spm=x')).toBe( + 'https://detail.1688.com/offer/910933345396.html', + ); + expect(__test__.canonicalizeSellerUrl('https://yinuoweierfushi.1688.com/page/contactinfo.html?tracelog=1')).toBe( + 'https://yinuoweierfushi.1688.com', + ); + expect(__test__.extractShopId('https://yinuoweierfushi.1688.com/page/index.html')).toBe('yinuoweierfushi'); + }); + + it('parses price ranges and moq text', () => { + expect(__test__.parsePriceText('¥96.00-98.00')).toEqual({ + price_text: '¥96.00-98.00', + price_min: 96, + price_max: 98, + currency: 'CNY', + }); + + expect(__test__.parsePriceText('¥ 14 .28')).toEqual({ + price_text: '¥14.28', + price_min: 14.28, + price_max: 14.28, + currency: 'CNY', + }); + + expect(__test__.parseMoqText('3套起批')).toEqual({ + moq_text: '3套起批', + moq_value: 3, + }); + + expect(__test__.parseMoqText('2~999个')).toEqual({ + moq_text: '2~999个', + moq_value: 2, + }); + }); + + it('detects captcha and login states', () => { + expect(__test__.extractLocation('山东青岛 送至 江苏苏州')).toBe('山东青岛'); + expect(__test__.isCaptchaState({ + href: 'https://s.1688.com/_____tmd_____/punish', + title: '验证码拦截', + body_text: '请拖动下方滑块完成验证', + })).toBe(true); + expect(__test__.isLoginState({ + href: 'https://login.taobao.com/member/login.jhtml', + title: '账号登录', + 
body_text: '请登录后继续', + })).toBe(true); + }); +}); diff --git a/src/clis/1688/shared.ts b/src/clis/1688/shared.ts new file mode 100644 index 00000000..6fcfb77f --- /dev/null +++ b/src/clis/1688/shared.ts @@ -0,0 +1,623 @@ +import { ArgumentError, AuthRequiredError, CommandExecutionError } from '../../errors.js'; +import type { IPage } from '../../types.js'; + +export const SITE = '1688'; +export const HOME_URL = 'https://www.1688.com/'; +export const SEARCH_URL_PREFIX = 'https://s.1688.com/selloffer/offer_search.htm?charset=utf8&keywords='; +export const DETAIL_URL_PREFIX = 'https://detail.1688.com/offer/'; +export const STORE_MOBILE_URL_PREFIX = 'https://winport.m.1688.com/page/index.html?memberId='; +export const STRATEGY = 'cookie'; +export const SEARCH_LIMIT_DEFAULT = 20; +export const SEARCH_LIMIT_MAX = 100; + +const STORE_GENERIC_HOSTS = new Set(['www', 'detail', 's', 'winport', 'work', 'air', 'dj']); +const TRACKING_QUERY_KEYS = new Set([ + 'spm', + 'tracelog', + 'clickid', + 'source', + 'scene', + 'from', + 'src', + 'ns', + 'cna', + 'pvid', +]); +const CAPTCHA_URL_MARKER = '/_____tmd_____/punish'; +const CAPTCHA_TEXT_PATTERNS = [ + '请拖动下方滑块完成验证', + '请按住滑块,拖动到最右边', + '通过验证以确保正常访问', + '验证码拦截', + '访问验证', + '滑动验证', +]; +const LOGIN_TEXT_PATTERNS = [ + '请登录', + '登录后', + '账号登录', + '手机登录', + '立即登录', + '扫码登录', + '请先完成登录', + '请先登录后查看', +]; +const LOGIN_URL_PATTERNS = ['/member/login', 'passport', 'login.taobao.com', 'account.1688.com']; + +export const FACTORY_BADGE_PATTERNS = [ + '源头工厂', + '深度验厂', + '实力工厂', + '工厂档案', + '加工专区', + '验厂报告', + '厂家直销', + '生产厂家', + '工厂直供', +]; +export const SERVICE_BADGE_PATTERNS = [ + '延期必赔', + '品质保障', + '破损包赔', + '退货包运费', + '晚发必赔', + '7*24小时响应', + '48小时发货', + '72小时发货', + '后天达', + '包邮', + '闪电拿样', +]; + +const CHINA_LOCATIONS = [ + '北京', + '天津', + '上海', + '重庆', + '河北', + '山西', + '辽宁', + '吉林', + '黑龙江', + '江苏', + '浙江', + '安徽', + '福建', + '江西', + '山东', + '河南', + '湖北', + '湖南', + '广东', + '海南', + '四川', + '贵州', + '云南', + '陕西', + '甘肃', + '青海', + 
/**
 * Normalize a value to single-line text.
 *
 * Non-string input yields `''`. For strings, non-breaking spaces become
 * regular spaces, every whitespace run (including newlines) collapses to one
 * space, and the result is trimmed.
 */
export function cleanText(value: unknown): string {
  if (typeof value !== 'string') return '';
  return value.replace(/\u00a0/g, ' ').replace(/\s+/g, ' ').trim();
}

/**
 * Normalize a value to multi-line text.
 *
 * Non-string input yields `''`. For strings, the text is split on `\n`; each
 * line has its whitespace collapsed and trimmed, empty lines are dropped, and
 * the remaining lines are re-joined with `\n`.
 */
export function cleanMultilineText(value: unknown): string {
  if (typeof value !== 'string') return '';
  return value
    .replace(/\u00a0/g, ' ')
    .split('\n')
    .map((line) => line.replace(/\s+/g, ' ').trim())
    .filter(Boolean)
    .join('\n');
}

/**
 * Clean every entry with `cleanText`, drop empty results, and de-duplicate
 * while keeping first-seen order.
 *
 * @param values Candidate values; non-strings are treated as empty and dropped.
 */
export function uniqueNonEmpty(values: ReadonlyArray<unknown>): string[] {
  return [...new Set(values.map((value) => cleanText(value)).filter(Boolean))];
}
SEARCH_LIMIT_DEFAULT), 10); + if (!Number.isFinite(parsed) || parsed < 1) { + throw new ArgumentError( + '1688 search --limit must be a positive integer', + 'Example: opencli 1688 search "桌面置物架" --limit 20', + ); + } + return Math.min(SEARCH_LIMIT_MAX, parsed); +} + +export function buildSearchUrl(query: string): string { + const normalized = cleanText(query); + if (!normalized) { + throw new ArgumentError( + '1688 search query cannot be empty', + 'Example: opencli 1688 search "桌面置物架" --limit 20', + ); + } + return `${SEARCH_URL_PREFIX}${encodeURIComponent(normalized)}`; +} + +export function buildDetailUrl(input: string): string { + const offerId = extractOfferId(input); + if (!offerId) { + throw new ArgumentError( + '1688 item expects an offer URL or offer ID', + 'Example: opencli 1688 item 887904326744', + ); + } + return `${DETAIL_URL_PREFIX}${offerId}.html`; +} + +export function resolveStoreUrl(input: string): string { + const normalized = cleanText(input); + if (!normalized) { + throw new ArgumentError( + '1688 store expects a store URL or member ID', + 'Example: opencli 1688 store https://yinuoweierfushi.1688.com/', + ); + } + + const memberId = extractMemberId(normalized); + if (memberId) { + return `${STORE_MOBILE_URL_PREFIX}${memberId}`; + } + + if (/^https?:\/\//i.test(normalized)) { + return canonicalizeStoreUrl(normalized); + } + + if (normalized.endsWith('.1688.com')) { + return canonicalizeStoreUrl(`https://${normalized}`); + } + + if (/^[a-z0-9-]+$/i.test(normalized)) { + return canonicalizeStoreUrl(`https://${normalized}.1688.com`); + } + + throw new ArgumentError( + '1688 store expects a store URL or member ID', + 'Example: opencli 1688 store b2b-22154705262941f196', + ); +} + +export function canonicalizeStoreUrl(input: string): string { + const url = parse1688Url(input); + const memberId = extractMemberId(url.toString()); + if (memberId) { + return `${STORE_MOBILE_URL_PREFIX}${memberId}`; + } + + const host = normalizeStoreHost(url.hostname); + 
/**
 * Pull a 1688 offer ID (six or more digits) out of free-form input.
 *
 * Accepted forms, tried in order: a bare numeric ID, a `/offer/<id>.html`
 * path, and an `offerId=<id>` query parameter.
 *
 * @returns The ID, or `null` when the input is empty or nothing matches.
 */
export function extractOfferId(input: string): string | null {
  const text = cleanText(input);
  if (!text) return null;

  // Each rule is [pattern, index of the match group that holds the ID].
  const rules: Array<[RegExp, number]> = [
    [/^\d{6,}$/, 0],
    [/\/offer\/(\d{6,})\.html/i, 1],
    [/[?&]offerId=(\d{6,})/i, 1],
  ];
  for (const [pattern, group] of rules) {
    const found = text.match(pattern)?.[group];
    if (found) return found;
  }
  return null;
}

/**
 * Pull a 1688 member ID (`b2b-` followed by letters/digits) out of free-form input.
 *
 * Tried in order: a standalone `b2b-…` token anywhere in the text, a
 * `memberId=` query parameter, and a `/winport/<id>.html` path.
 *
 * @returns The ID as it appears in the input, or `null` when nothing matches.
 */
export function extractMemberId(input: string): string | null {
  const text = cleanText(input);
  if (!text) return null;

  // Each rule is [pattern, index of the match group that holds the ID].
  const rules: Array<[RegExp, number]> = [
    [/\bb2b-[a-z0-9]+\b/i, 0],
    [/[?&]memberId=(b2b-[a-z0-9]+)/i, 1],
    [/\/winport\/(b2b-[a-z0-9]+)\.html/i, 1],
  ];
  for (const [pattern, group] of rules) {
    const found = text.match(pattern)?.[group];
    if (found) return found;
  }
  return null;
}
/**
 * Convert raw tier records into `PriceTier` rows.
 *
 * `beginAmount` and `price` are parsed with `toNumber`. A tier is kept only
 * when it has a non-empty `price_text`; kept tiers are always labelled `CNY`.
 *
 * @param rawTiers Raw tier objects; both fields are untyped.
 * @param unit     Unit suffix appended to the quantity text, or `null` for none.
 */
export function normalizePriceTiers(
  rawTiers: Array<{ beginAmount?: unknown; price?: unknown }>,
  unit: string | null,
): PriceTier[] {
  return rawTiers
    .map((tier) => {
      const quantityMin = toNumber(tier.beginAmount);
      const price = toNumber(tier.price);
      // cleanText() only understands strings, so a numeric price would yield an
      // empty price_text and the tier would be filtered out below even though
      // toNumber() parsed it. Render finite numbers explicitly instead.
      const priceText = typeof tier.price === 'number'
        ? (price === null ? '' : String(price))
        : cleanText(tier.price);
      return {
        quantity_text: quantityMin !== null ? `${quantityMin}${unit ?? ''}` : '',
        quantity_min: quantityMin,
        price_text: priceText,
        price,
        currency: priceText ? 'CNY' : null,
      };
    })
    .filter((tier) => tier.price_text);
}
/**
 * Find the value that follows `label` at the start of a line.
 *
 * The label may be followed by an optional colon (full-width or ASCII). The
 * `\s*` after it also matches a newline, so both `label: value` and a label
 * with its value on the next line are handled by the same pattern. The former
 * second "label then newline" pattern could never match when this one did not,
 * and has been removed.
 *
 * @param text  Page text; normalized with `cleanMultilineText` before matching.
 * @param label Literal label text (escaped before being placed in the regex).
 * @returns The cleaned value, or `null` when the label is not found.
 */
export function extractMetric(text: string, label: string): string | null {
  const normalized = cleanMultilineText(text);
  const pattern = new RegExp(`(?:^|\\n)\\s*${escapeForRegex(label)}[::]?\\s*([^\\n]+)`);
  const match = normalized.match(pattern);
  return match ? cleanText(match[1]) : null;
}
/**
 * Report whether a page snapshot looks like a captcha / slider challenge.
 *
 * True when the lower-cased URL contains `CAPTCHA_URL_MARKER`, or when the
 * title or body text contains any entry of `CAPTCHA_TEXT_PATTERNS`.
 *
 * @param state Page snapshot; missing fields are treated as empty text.
 */
export function isCaptchaState(state: Partial<PageState>): boolean {
  const href = cleanText(state.href).toLowerCase();
  const title = cleanText(state.title);
  const bodyText = cleanMultilineText(state.body_text);
  if (href.includes(CAPTCHA_URL_MARKER)) return true;
  return CAPTCHA_TEXT_PATTERNS.some((pattern) => title.includes(pattern) || bodyText.includes(pattern));
}

/**
 * Report whether a page snapshot looks like a login wall.
 *
 * True when the lower-cased URL contains any entry of `LOGIN_URL_PATTERNS`,
 * or when the title or body text contains any entry of `LOGIN_TEXT_PATTERNS`.
 *
 * @param state Page snapshot; missing fields are treated as empty text.
 */
export function isLoginState(state: Partial<PageState>): boolean {
  const href = cleanText(state.href).toLowerCase();
  const title = cleanText(state.title);
  const bodyText = cleanMultilineText(state.body_text);
  if (LOGIN_URL_PATTERNS.some((pattern) => href.includes(pattern))) return true;
  return LOGIN_TEXT_PATTERNS.some((pattern) => title.includes(pattern) || bodyText.includes(pattern));
}
/**
 * Navigate to `url`, wait briefly, and return the resulting page snapshot.
 *
 * Errors whose message indicates that the browser target went away are
 * rethrown as a `CommandExecutionError` with a recovery hint; any other error
 * propagates unchanged.
 *
 * @param page     Browser page to drive.
 * @param url      Destination URL.
 * @param settleMs Settle time forwarded to `page.goto`.
 * @param action   Short label used in error messages and hints.
 * @throws CommandExecutionError when the browser target was lost.
 */
export async function gotoAndReadState(
  page: IPage,
  url: string,
  settleMs: number = 2500,
  action: string = 'page',
): Promise<PageState> {
  try {
    await page.goto(url, { settleMs });
    await page.wait(1.5);
    // `await` is required here: returning the bare promise would let a
    // rejection from readPageState() escape this try/catch, so lost-target
    // errors raised while reading the page would never get the hint below.
    return await readPageState(page);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    if (
      message.includes('Inspected target navigated or closed')
      || message.includes('Cannot find context with specified id')
      || message.includes('Target closed')
    ) {
      throw new CommandExecutionError(
        `1688 ${action} navigation lost the current browser target`,
        `${buildCaptchaHint(action)} If CDP is attached to a stale or blocked tab, open a fresh 1688 tab and point OPENCLI_CDP_TARGET at that tab.`,
      );
    }
    throw error;
  }
}

/**
 * Open the 1688 homepage and verify it shows neither a captcha nor a login wall.
 *
 * @throws AuthRequiredError (via `assertAuthenticatedState`) when it does.
 */
export async function ensure1688Session(page: IPage): Promise<void> {
  const state = await gotoAndReadState(page, HOME_URL, 1500, 'homepage');
  assertAuthenticatedState(state, 'homepage');
}
/**
 * Return at most `limit` leading elements of `values`.
 *
 * The limit is truncated to an integer and clamped to a minimum of 1, so `0`,
 * negative numbers and `NaN` all behave like `1`.
 */
export function limitCandidates<T>(values: readonly T[], limit: number): T[] {
  const normalizedLimit = Math.max(1, Math.trunc(limit) || 1);
  return values.slice(0, normalizedLimit);
}

/**
 * Tighten whitespace inside numeric text so number patterns can match it.
 *
 * Removes whitespace between a currency sign and the following digit, around
 * a decimal point that sits between digits, and around `~` / `-` separators,
 * then trims the result.
 */
function normalizeNumericText(value: string): string {
  return value
    .replace(/([¥$€])\s+(?=\d)/g, '$1')
    .replace(/(\d)\s*\.\s*(\d)/g, '$1.$2')
    .replace(/\s*([~-])\s*/g, '$1')
    .trim();
}

/** Backslash-escape regex metacharacters so `value` can be embedded literally in a `RegExp`. */
function escapeForRegex(value: string): string {
  return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
normalizePriceTiers, + parseMoqText, + extractLocation, + extractAddress, + extractMetric, + extractYearsOnPlatform, + extractMainBusiness, + extractBadges, + guessTopCategories, + isCaptchaState, + isLoginState, + cleanText, + cleanMultilineText, + uniqueNonEmpty, + limitCandidates, +}; diff --git a/src/clis/1688/store.test.ts b/src/clis/1688/store.test.ts new file mode 100644 index 00000000..6da43135 --- /dev/null +++ b/src/clis/1688/store.test.ts @@ -0,0 +1,69 @@ +import { describe, expect, it } from 'vitest'; +import { __test__ } from './store.js'; + +describe('1688 store normalization', () => { + it('merges store contact text with seller seed data', () => { + const result = __test__.normalizeStorePayload({ + resolvedUrl: 'https://yinuoweierfushi.1688.com/?offerId=887904326744', + explicitMemberId: null, + storePayload: { + href: 'https://yinuoweierfushi.1688.com/page/index.html', + bodyText: ` + 青岛沁澜衣品服装有限公司 + 联系方式 + 地址:山东省青岛市即墨区环秀街道办事处湘江二路97号甲 + `, + offerLinks: ['https://detail.1688.com/offer/887904326744.html'], + }, + contactPayload: { + href: 'https://yinuoweierfushi.1688.com/page/contactinfo.html', + bodyText: ` + 青岛沁澜衣品服装有限公司 + 电话:86 0532 86655366 + 手机:15963238678 + 地址:山东省青岛市即墨区环秀街道办事处湘江二路97号甲 + `, + }, + seed: { + bodyText: ` + 入驻13年 + 主营:大码女装 + 店铺回头率 + 87% + 延期必赔 + 品质保障 + `, + seller: { + companyName: '青岛沁澜衣品服装有限公司', + memberId: 'b2b-1641351767', + winportUrl: 'https://yinuoweierfushi.1688.com/page/index.html?spm=abc', + }, + services: [{ serviceName: '延期必赔' }, { serviceName: '品质保障' }], + }, + }); + + expect(result.member_id).toBe('b2b-1641351767'); + expect(result.store_url).toBe('https://yinuoweierfushi.1688.com'); + expect(result.company_url).toBe('https://yinuoweierfushi.1688.com/page/contactinfo.html'); + expect(result.years_on_platform_text).toBe('入驻13年'); + expect(result.location).toBe('山东省青岛市即墨区环秀街道办事处湘江二路97号甲'); + expect(result.return_rate_text).toContain('87%'); + expect(result.top_categories).toEqual(['大码女装']); + 
/** Snapshot read from a store or contact page inside the browser. */
interface StoreBrowserPayload {
  /** Final URL of the page that was read. */
  href?: string;
  /** Document title. */
  title?: string;
  /** Visible body text of the page. */
  bodyText?: string;
  /** Hrefs of anchors that point at offer detail pages. */
  offerLinks?: string[];
  /** Hrefs of anchors that point at a contact-info page. */
  contactLinks?: string[];
}

/** Seller context read from one of the store's item detail pages. */
interface StoreItemSeed {
  /** Final URL of the item page that was read. */
  href?: string;
  /** Visible body text of the item page. */
  bodyText?: string;
  /** Seller block of the page's embedded data model. */
  seller?: {
    companyName?: string;
    memberId?: string;
    winportUrl?: string;
    /** Keyed store URLs; the `defaultUrl` entry is used as a fallback store URL. */
    sellerWinportUrlMap?: Record<string, string>;
  };
  /** Service entries from the page's embedded data model. */
  services?: Array<{ serviceName?: string }>;
}
/**
 * Merge everything read for a store into one flat output row.
 *
 * Text from the contact page, the store page and the item seed is combined
 * (in that order) for badge and metric extraction. Address, phone and mobile
 * are read from the contact page text, with the store page text as a fallback
 * for the address only.
 *
 * @param input.resolvedUrl      URL the command resolved from the user's input.
 * @param input.storePayload     Snapshot of the store page, if read.
 * @param input.contactPayload   Snapshot of the contact page, if read.
 * @param input.seed             Seller context from an item page, if read.
 * @param input.explicitMemberId Member ID found in the raw user input, if any.
 * @returns A flat record of store fields plus the fields from `buildProvenance`.
 */
function normalizeStorePayload(input: {
  resolvedUrl: string;
  storePayload: StoreBrowserPayload | null;
  contactPayload: StoreBrowserPayload | null;
  seed: StoreItemSeed | null;
  explicitMemberId: string | null;
}): Record<string, unknown> {
  const storePayload = input.storePayload;
  const contactPayload = input.contactPayload;
  const seed = input.seed;

  const contactText = cleanMultilineText(contactPayload?.bodyText);
  const storeText = cleanMultilineText(storePayload?.bodyText);
  const seedText = cleanMultilineText(seed?.bodyText);
  const combinedText = [contactText, storeText, seedText].filter(Boolean).join('\n');

  // Store URL preference: seller model URL, then its default map entry,
  // then the page we actually landed on, then the resolved input URL.
  const sellerUrlRaw = cleanText(
    seed?.seller?.winportUrl
      ?? seed?.seller?.sellerWinportUrlMap?.defaultUrl
      ?? storePayload?.href
      ?? input.resolvedUrl,
  );
  const storeUrl = safeCanonicalStoreUrl(sellerUrlRaw || input.resolvedUrl) ?? input.resolvedUrl;
  const sellerUrl = canonicalizeSellerUrl(sellerUrlRaw) ?? storeUrl;
  const companyUrl = pickCompanyUrl(contactPayload?.href, storeUrl);
  const memberId = cleanText(seed?.seller?.memberId)
    || input.explicitMemberId
    || extractMemberId(input.resolvedUrl)
    || extractMemberId(storePayload?.href ?? '')
    || null;
  const shopId = extractShopId(sellerUrl) ?? extractShopId(storeUrl);
  const companyName = cleanText(seed?.seller?.companyName)
    || firstNamedLine(contactText)
    || firstNamedLine(storeText)
    || null;
  // Badges come from visible text first, then from the seed's service entries.
  const serviceBadges = uniqueNonEmpty([
    ...extractBadges(combinedText, SERVICE_BADGE_PATTERNS),
    ...((seed?.services ?? []).map((service) => cleanText(service.serviceName))),
  ]);
  const factoryBadges = extractBadges(combinedText, FACTORY_BADGE_PATTERNS);

  return {
    member_id: memberId,
    shop_id: shopId,
    store_name: companyName,
    store_url: storeUrl,
    company_name: companyName,
    company_url: companyUrl,
    business_model_text: firstMetric(combinedText, ['经营模式', '生产加工', '主营产品']),
    years_on_platform_text: extractYearsOnPlatform(combinedText),
    location: extractAddress(contactText) ?? extractAddress(storeText),
    staff_size_text: firstMetric(combinedText, ['员工人数', '员工总数']),
    factory_badges: factoryBadges,
    service_badges: serviceBadges,
    response_rate_text: firstMetric(combinedText, ['响应率', '回复率', '响应速度']),
    return_rate_text: extractReturnRate(combinedText),
    top_categories: guessTopCategories(combinedText),
    phone_text: extractMetric(contactText, '电话'),
    mobile_text: extractMetric(contactText, '手机'),
    ...buildProvenance(cleanText(contactPayload?.href) || cleanText(storePayload?.href) || input.resolvedUrl),
  };
}
/** Canonicalize a store URL, returning `null` instead of throwing when it is not usable. */
function safeCanonicalStoreUrl(url: string): string | null {
  let canonical: string | null;
  try {
    canonical = canonicalizeStoreUrl(url);
  } catch {
    canonical = null;
  }
  return canonical;
}

/**
 * Choose the contact-page URL for a store.
 *
 * The URL of the contact page that was actually read wins when it normalizes
 * to a 1688 contact URL; otherwise one is derived from the store URL.
 */
function pickCompanyUrl(contactHref: string | undefined, storeUrl: string): string | null {
  const pageHref = cleanText(contactHref);
  const fromContactPage = pageHref ? buildContactUrl(pageHref) : null;
  return fromContactPage || buildContactUrl(storeUrl);
}

/**
 * Build `<protocol>//<host>/page/contactinfo.html` for a URL on a `*.1688.com` host.
 *
 * @returns The contact URL, or `null` when the input is not a parseable URL
 *          or its host is not a 1688 subdomain.
 */
function buildContactUrl(storeUrl: string): string | null {
  let parsed: URL;
  try {
    parsed = new URL(storeUrl);
  } catch {
    return null;
  }
  const { protocol, hostname } = parsed;
  return hostname.endsWith('.1688.com') ? `${protocol}//${hostname}/page/contactinfo.html` : null;
}
/** Return the first truthy `extractMetric` value among `labels`, tried in order; `null` if none. */
function firstMetric(text: string, labels: string[]): string | null {
  for (const label of labels) {
    const value = extractMetric(text, label);
    if (value) return value;
  }
  return null;
}

/**
 * Extract the repeat-purchase rate ("回头率" followed by a percentage).
 *
 * `\s*` also matches newlines, so a percentage on the line after the label is
 * covered by this single pattern; the matched text is returned with its
 * whitespace collapsed. The former separate multiline branch could never be
 * reached and has been removed.
 *
 * @returns The cleaned matched text, or `null` when no rate is present.
 */
function extractReturnRate(text: string): string | null {
  const match = text.match(/回头率\s*([0-9.]+%)/);
  return match ? cleanText(match[0]) : null;
}

/** Return the first offer ID that `extractOfferId` finds among `links`, or `null`. */
function firstOfferId(links: string[]): string | null {
  for (const link of links) {
    const offerId = extractOfferId(link);
    if (offerId) return offerId;
  }
  return null;
}

/** Return the first link that `buildContactUrl` can turn into a contact URL, or `null`. */
function firstContactUrl(links: string[]): string | null {
  for (const link of links) {
    const url = buildContactUrl(link);
    if (url) return url;
  }
  return null;
}

/**
 * Navigate to a store-related page and read its visible text and key links.
 *
 * @param page   Browser page to drive.
 * @param url    Page to open.
 * @param action Short label used in error messages.
 * @throws AuthRequiredError (via `assertAuthenticatedState`) on a captcha or login wall.
 */
async function readStorePayload(page: IPage, url: string, action: string): Promise<StoreBrowserPayload> {
  const state = await gotoAndReadState(page, url, 2500, action);
  assertAuthenticatedState(state, action);

  // The script runs inside the page; its result is trusted to match StoreBrowserPayload.
  return await page.evaluate(`
    (() => ({
      href: window.location.href,
      title: document.title || '',
      bodyText: document.body ? document.body.innerText || '' : '',
      offerLinks: Array.from(document.querySelectorAll('a[href*="detail.1688.com/offer/"], a[href*="offerId="]'))
        .map((anchor) => anchor.href)
        .filter(Boolean),
      contactLinks: Array.from(document.querySelectorAll('a[href*="contactinfo"]'))
        .map((anchor) => anchor.href)
        .filter(Boolean),
    }))()
  `) as StoreBrowserPayload;
}
null)); + return { + href: window.location.href, + bodyText: document.body ? document.body.innerText || '' : '', + seller: toJson(model?.sellerModel), + services: toJson(model?.shippingServices?.fields?.buyerProtectionModel ?? []), + }; + })() + `) as StoreItemSeed; + + const hasSellerContext = !!cleanText(seed?.seller?.memberId) || !!cleanText(seed?.seller?.winportUrl); + if (!hasSellerContext) { + throw new CommandExecutionError( + '1688 store seed item did not expose seller context', + '当前 tab 非商品详情上下文,请切到 detail.1688.com 商品页并重试', + ); + } + + return seed; +} + +function hasAnyEvidence( + storePayload: StoreBrowserPayload | null, + contactPayload: StoreBrowserPayload | null, + seed: StoreItemSeed | null, +): boolean { + return !!cleanText(storePayload?.bodyText) + || !!cleanText(contactPayload?.bodyText) + || !!cleanText(seed?.bodyText); +} + +cli({ + site: '1688', + name: 'store', + description: '1688 店铺/供应商公开信息(联系方式、主营、入驻年限、公开服务信号)', + domain: 'www.1688.com', + strategy: Strategy.COOKIE, + navigateBefore: false, + args: [ + { + name: 'input', + required: true, + positional: true, + help: '1688 店铺 URL 或 member ID(如 b2b-22154705262941f196)', + }, + ], + columns: ['store_name', 'years_on_platform_text', 'location', 'return_rate_text'], + func: async (page, kwargs) => { + const rawInput = String(kwargs.input ?? ''); + const resolvedUrl = resolveStoreUrl(rawInput); + const explicitMemberId = extractMemberId(rawInput); + + const storePayload = await readStorePayload(page, resolvedUrl, 'store'); + const contactUrl = firstContactUrl(storePayload.contactLinks ?? []) || buildContactUrl(storePayload.href || resolvedUrl); + const contactPayload = contactUrl ? await readStorePayload(page, contactUrl, 'store contact') : null; + const offerId = extractOfferId(rawInput) + || firstOfferId(storePayload.offerLinks ?? []) + || firstOfferId(contactPayload?.offerLinks ?? 
[]); + + let seed: StoreItemSeed | null = null; + if (offerId) { + try { + seed = await readItemSeed(page, offerId); + } catch (error) { + if (!(error instanceof CommandExecutionError)) throw error; + } + } + + if (!hasAnyEvidence(storePayload, contactPayload, seed)) { + throw new EmptyResultError( + '1688 store', + 'Store page is reachable but no visible fields were extracted. Open the store page in Chrome and retry.', + ); + } + + return [ + normalizeStorePayload({ + resolvedUrl, + storePayload, + contactPayload, + seed, + explicitMemberId, + }), + ]; + }, +}); + +export const __test__ = { + normalizeStorePayload, + safeCanonicalStoreUrl, + buildContactUrl, + firstNamedLine, + firstMetric, + extractReturnRate, + firstOfferId, + firstContactUrl, +};