diff --git a/package-lock.json b/package-lock.json index 33295d6..edddade 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,10 +9,14 @@ "version": "0.1.0", "license": "MIT", "dependencies": { - "turndown": "^7.2.2" + "@mozilla/readability": "0.6.0", + "jsdom": "29.1.1", + "turndown": "^7.2.2", + "undici": "7.28.0" }, "devDependencies": { "@biomejs/biome": "2.3.5", + "@types/jsdom": "28.0.3", "@types/node": "^22.10.5", "@types/turndown": "^5.0.5", "@typescript/native-preview": "^7.0.0-dev.20260120.1", @@ -20,7 +24,7 @@ "vitest": "^4.1.5" }, "engines": { - "node": ">=20.0.0" + "node": ">=20.19.0" }, "peerDependencies": { "@mariozechner/pi-ai": "*", @@ -50,6 +54,53 @@ } } }, + "node_modules/@asamuzakjp/css-color": { + "version": "5.1.11", + "resolved": "https://registry.npmjs.org/@asamuzakjp/css-color/-/css-color-5.1.11.tgz", + "integrity": "sha512-KVw6qIiCTUQhByfTd78h2yD1/00waTmm9uy/R7Ck/ctUyAPj+AEDLkQIdJW0T8+qGgj3j5bpNKK7Q3G+LedJWg==", + "license": "MIT", + "dependencies": { + "@asamuzakjp/generational-cache": "^1.0.1", + "@csstools/css-calc": "^3.2.0", + "@csstools/css-color-parser": "^4.1.0", + "@csstools/css-parser-algorithms": "^4.0.0", + "@csstools/css-tokenizer": "^4.0.0" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, + "node_modules/@asamuzakjp/dom-selector": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/@asamuzakjp/dom-selector/-/dom-selector-7.1.1.tgz", + "integrity": "sha512-67RZDnYRc8H/8MLDgQCDE//zoqVFwajkepHZgmXrbwybzXOEwOWGPYGmALYl9J2DOLfFPPs6kKCqmbzV895hTQ==", + "license": "MIT", + "dependencies": { + "@asamuzakjp/generational-cache": "^1.0.1", + "@asamuzakjp/nwsapi": "^2.3.9", + "bidi-js": "^1.0.3", + "css-tree": "^3.2.1", + "is-potential-custom-element-name": "^1.0.1" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, + "node_modules/@asamuzakjp/generational-cache": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/@asamuzakjp/generational-cache/-/generational-cache-1.0.1.tgz", + "integrity": "sha512-wajfB8KqzMCN2KGNFdLkReeHncd0AslUSrvHVvvYWuU8ghncRJoA50kT3zP9MVL0+9g4/67H+cdvBskj9THPzg==", + "license": "MIT", + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, + "node_modules/@asamuzakjp/nwsapi": { + "version": "2.3.9", + "resolved": "https://registry.npmjs.org/@asamuzakjp/nwsapi/-/nwsapi-2.3.9.tgz", + "integrity": "sha512-n8GuYSrI9bF7FFZ/SjhwevlHc8xaVlb/7HmHelnc/PZXBD2ZR49NnN9sMMuDdEGPeeRQ5d0hqlSlEpgCX3Wl0Q==", + "license": "MIT" + }, "node_modules/@aws-crypto/crc32": { "version": "5.2.0", "resolved": "https://registry.npmjs.org/@aws-crypto/crc32/-/crc32-5.2.0.tgz", @@ -1059,6 +1110,152 @@ "url": "https://github.com/sponsors/Borewit" } }, + "node_modules/@bramus/specificity": { + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/@bramus/specificity/-/specificity-2.4.2.tgz", + "integrity": "sha512-ctxtJ/eA+t+6q2++vj5j7FYX3nRu311q1wfYH3xjlLOsczhlhxAg2FWNUXhpGvAw3BWo1xBcvOV6/YLc2r5FJw==", + "license": "MIT", + "dependencies": { + "css-tree": "^3.0.0" + }, + "bin": { + "specificity": "bin/cli.js" + } + }, + "node_modules/@csstools/color-helpers": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/@csstools/color-helpers/-/color-helpers-6.0.2.tgz", + "integrity": "sha512-LMGQLS9EuADloEFkcTBR3BwV/CGHV7zyDxVRtVDTwdI2Ca4it0CCVTT9wCkxSgokjE5Ho41hEPgb8OEUwoXr6Q==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT-0", + "engines": { + "node": ">=20.19.0" + } + }, + "node_modules/@csstools/css-calc": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/@csstools/css-calc/-/css-calc-3.2.1.tgz", + "integrity": "sha512-DtdHlgXh5ZkA43cwBcAm+huzgJiwx3ZTWVjBs94kwz2xKqSimDA3lBgCjphYgwgVUMWatSM0pDd8TILB1yrVVg==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "engines": { + "node": ">=20.19.0" + }, + "peerDependencies": { + "@csstools/css-parser-algorithms": "^4.0.0", + "@csstools/css-tokenizer": "^4.0.0" + } + }, + "node_modules/@csstools/css-color-parser": { + "version": "4.1.8", + "resolved": "https://registry.npmjs.org/@csstools/css-color-parser/-/css-color-parser-4.1.8.tgz", + "integrity": "sha512-3chWb7PRLijpJpPIKkDxdu6IBeO5MrFACND57On0j8OPpc0wZibcGc3xAHrSEbOx/KDRyMHoIxGn0w1PhXMYHw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "dependencies": { + "@csstools/color-helpers": "^6.0.2", + "@csstools/css-calc": "^3.2.1" + }, + "engines": { + "node": ">=20.19.0" + }, + "peerDependencies": { + "@csstools/css-parser-algorithms": "^4.0.0", + "@csstools/css-tokenizer": "^4.0.0" + } + }, + "node_modules/@csstools/css-parser-algorithms": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/@csstools/css-parser-algorithms/-/css-parser-algorithms-4.0.0.tgz", + "integrity": "sha512-+B87qS7fIG3L5h3qwJ/IFbjoVoOe/bpOdh9hAjXbvx0o8ImEmUsGXN0inFOnk2ChCFgqkkGFQ+TpM5rbhkKe4w==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "engines": { + "node": ">=20.19.0" + }, + "peerDependencies": { + "@csstools/css-tokenizer": "^4.0.0" + } + }, + "node_modules/@csstools/css-syntax-patches-for-csstree": { + "version": "1.1.5", + "resolved": "https://registry.npmjs.org/@csstools/css-syntax-patches-for-csstree/-/css-syntax-patches-for-csstree-1.1.5.tgz", + "integrity": "sha512-oNjBvzLq2GPZtJphCjLqXow/cHySHSgtxvKZb7OqSZ/xHgw6NWNhfad+6AB9cLeVm6eA9d/qMll3JdEHjy6M+A==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT-0", + "peerDependencies": { + "css-tree": "^3.2.1" + }, + "peerDependenciesMeta": { + "css-tree": { + "optional": true + } + } + }, + "node_modules/@csstools/css-tokenizer": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/@csstools/css-tokenizer/-/css-tokenizer-4.0.0.tgz", + "integrity": "sha512-QxULHAm7cNu72w97JUNCBFODFaXpbDg+dP8b/oWFAZ2MTRppA3U00Y2L1HqaS4J6yBqxwa/Y3nMBaxVKbB/NsA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "engines": { + "node": ">=20.19.0" + } + }, "node_modules/@emnapi/core": { "version": "1.10.0", "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.10.0.tgz", @@ -1093,6 +1290,23 @@ "tslib": "^2.4.0" } }, + "node_modules/@exodus/bytes": { + "version": "1.15.1", + "resolved": "https://registry.npmjs.org/@exodus/bytes/-/bytes-1.15.1.tgz", + "integrity": "sha512-S6mL0yNB/Abt9Ei4tq8gDhcczc4S3+vQ4ra7vxnAf+YHC02srtqxKKZghx2Dq6p0e66THKwR6r8N6P95wEty7Q==", + "license": "MIT", + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + }, + "peerDependencies": { + "@noble/hashes": "^1.8.0 || ^2.0.0" + }, + "peerDependenciesMeta": { + "@noble/hashes": { + "optional": true + } + } + }, "node_modules/@google/genai": { "version": "1.51.0", "resolved": "https://registry.npmjs.org/@google/genai/-/genai-1.51.0.tgz", @@ -1461,6 +1675,15 @@ "integrity": "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==", "license": "BSD-2-Clause" }, + "node_modules/@mozilla/readability": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.6.0.tgz", + "integrity": "sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ==", + "license": "Apache-2.0", + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/@napi-rs/wasm-runtime": { "version": "1.1.4", "resolved": "https://registry.npmjs.org/@napi-rs/wasm-runtime/-/wasm-runtime-1.1.4.tgz", @@ -2623,6 +2846,39 @@ "dev": true, "license": "MIT" }, + "node_modules/@types/jsdom": { + "version": "28.0.3", + "resolved": "https://registry.npmjs.org/@types/jsdom/-/jsdom-28.0.3.tgz", + "integrity": "sha512-/HQ2uFoetFTXuye8vzIcHw2z6Fwi7Hi/qcgC+RoS9NCyewiqxhVGqlG+ViGB6lkax481R6dmhf1I7lIGlzJStQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*", + "@types/tough-cookie": "*", + "parse5": "^8.0.0", + "undici-types": "^7.21.0" + } + }, + "node_modules/@types/jsdom/node_modules/parse5": { + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-8.0.1.tgz", + "integrity": "sha512-z1e/HMG90obSGeidlli3hj7cbocou0/wa5HacvI3ASx34PecNjNQeaHNo5WIZpWofN9kgkqV1q5YvXe3F0FoPw==", + "dev": true, + "license": "MIT", + "dependencies": { + "entities": "^8.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/@types/jsdom/node_modules/undici-types": { + "version": "7.28.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.28.0.tgz", + "integrity": "sha512-LJAfY+2w6HGeT8d8J1wNQsUGUEGio6NWWpwdwurQe4f6oojzCFuGLizl1KSve4irsTxyLly1QhEeE6iapdaIvQ==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/mime-types": { "version": "2.1.4", "resolved": "https://registry.npmjs.org/@types/mime-types/-/mime-types-2.1.4.tgz", @@ -2646,6 +2902,13 @@ "license": "MIT", "peer": true }, + "node_modules/@types/tough-cookie": { + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.5.tgz", + "integrity": "sha512-/Ad8+nIOV7Rl++6f1BdKxFSMgmoqEoYbHRpPcx3JEfv8VRsQe9Z4mCXeJBzxs7mbHY/XOZZuXlRNfhpVPbs6ZA==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/turndown": { "version": "5.0.6", "resolved": "https://registry.npmjs.org/@types/turndown/-/turndown-5.0.6.tgz", @@ -3004,6 +3267,15 @@ "node": ">=10.0.0" } }, + "node_modules/bidi-js": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/bidi-js/-/bidi-js-1.0.3.tgz", + "integrity": "sha512-RKshQI1R3YQ+n9YJz2QQ147P66ELpa1FQEg20Dk8oW9t2KgLbpDLLp9aGZ7y8WHSshDknG0bknqGw5/tyCs5tw==", + "license": "MIT", + "dependencies": { + "require-from-string": "^2.0.2" + } + }, "node_modules/bignumber.js": { "version": "9.3.1", "resolved": "https://registry.npmjs.org/bignumber.js/-/bignumber.js-9.3.1.tgz", @@ -3175,6 +3447,19 @@ "dev": true, "license": "MIT" }, + "node_modules/css-tree": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/css-tree/-/css-tree-3.2.1.tgz", + "integrity": "sha512-X7sjQzceUhu1u7Y/ylrRZFU2FS6LRiFVp6rKLPg23y3x3c3DOKAwuXGDp+PAGjh6CSnCjYeAul8pcT8bAl+lSA==", + "license": "MIT", + "dependencies": { + "mdn-data": "2.27.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12.20.0 || ^14.13.0 || >=15.0.0" + } + }, "node_modules/data-uri-to-buffer": { "version": "4.0.1", "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-4.0.1.tgz", @@ -3185,6 +3470,19 @@ "node": ">= 12" } }, + "node_modules/data-urls": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-7.0.0.tgz", + "integrity": "sha512-23XHcCF+coGYevirZceTVD7NdJOqVn+49IHyxgszm+JIiHLoB2TkmPtsYkNWT1pvRSGkc35L6NHs0yHkN2SumA==", + "license": "MIT", + "dependencies": { + "whatwg-mimetype": "^5.0.0", + "whatwg-url": "^16.0.0" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, "node_modules/debug": { "version": "4.4.3", "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", @@ -3203,6 +3501,12 @@ } } }, + "node_modules/decimal.js": { + "version": "10.6.0", + "resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.6.0.tgz", + "integrity": "sha512-YpgQiITW3JXGntzdUmyUR1V812Hn8T1YVXhCu+wO3OpS4eU9l4YdD3qjyiKdV6mvV29zapkMeD390UVEf2lkUg==", + "license": "MIT" + }, "node_modules/degenerator": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/degenerator/-/degenerator-5.0.1.tgz", @@ -3265,6 +3569,18 @@ "once": "^1.4.0" } }, + "node_modules/entities": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-8.0.0.tgz", + "integrity": "sha512-zwfzJecQ/Uej6tusMqwAqU/6KL2XaB2VZ2Jg54Je6ahNBGNH6Ek6g3jjNCF0fG9EWQKGZNddNjU5F1ZQn/sBnA==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=20.19.0" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, "node_modules/es-module-lexer": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-2.1.0.tgz", @@ -3704,6 +4020,18 @@ "node": "^20.17.0 || >=22.9.0" } }, + "node_modules/html-encoding-sniffer": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-6.0.0.tgz", + "integrity": "sha512-CV9TW3Y3f8/wT0BRFc1/KAVQ3TUHiXmaAb6VW9vtiMFf7SLoMd1PdAc4W3KFOFETBJUb90KatHqlsZMWV+R9Gg==", + "license": "MIT", + "dependencies": { + "@exodus/bytes": "^1.6.0" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, "node_modules/http-proxy-agent": { "version": "7.0.2", "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz", @@ -3783,6 +4111,64 @@ "node": ">=8" } }, + "node_modules/is-potential-custom-element-name": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz", + "integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==", + "license": "MIT" + }, + "node_modules/jsdom": { + "version": "29.1.1", + "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-29.1.1.tgz", + "integrity": "sha512-ECi4Fi2f7BdJtUKTflYRTiaMxIB0O6zfR1fX0GXpUrf6flp8QIYn1UT20YQqdSOfk2dfkCwS8LAFoJDEppNK5Q==", + "license": "MIT", + "dependencies": { + "@asamuzakjp/css-color": "^5.1.11", + "@asamuzakjp/dom-selector": "^7.1.1", + "@bramus/specificity": "^2.4.2", + "@csstools/css-syntax-patches-for-csstree": "^1.1.3", + "@exodus/bytes": "^1.15.0", + "css-tree": "^3.2.1", + "data-urls": "^7.0.0", + "decimal.js": "^10.6.0", + "html-encoding-sniffer": "^6.0.0", + "is-potential-custom-element-name": "^1.0.1", + "lru-cache": "^11.3.5", + "parse5": "^8.0.1", + "saxes": "^6.0.0", + "symbol-tree": "^3.2.4", + "tough-cookie": "^6.0.1", + "undici": "^7.25.0", + "w3c-xmlserializer": "^5.0.0", + "webidl-conversions": "^8.0.1", + "whatwg-mimetype": "^5.0.0", + "whatwg-url": "^16.0.1", + "xml-name-validator": "^5.0.0" + }, + "engines": { + "node": "^20.19.0 || ^22.13.0 || >=24.0.0" + }, + "peerDependencies": { + "canvas": "^3.0.0" + }, + "peerDependenciesMeta": { + "canvas": { + "optional": true + } + } + }, + "node_modules/jsdom/node_modules/parse5": { + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-8.0.1.tgz", + "integrity": "sha512-z1e/HMG90obSGeidlli3hj7cbocou0/wa5HacvI3ASx34PecNjNQeaHNo5WIZpWofN9kgkqV1q5YvXe3F0FoPw==", + "license": "MIT", + "dependencies": { + "entities": "^8.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, "node_modules/json-bigint": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/json-bigint/-/json-bigint-1.0.0.tgz", @@ -4127,7 +4513,6 @@ "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-11.3.5.tgz", "integrity": "sha512-NxVFwLAnrd9i7KUBxC4DrUhmgjzOs+1Qm50D3oF1/oL+r1NpZ4gA7xvG0/zJ8evR7zIKn4vLf7qTNduWFtCrRw==", "license": "BlueOak-1.0.0", - "peer": true, "engines": { "node": "20 || >=22" } @@ -4155,6 +4540,12 @@ "node": ">= 18" } }, + "node_modules/mdn-data": { + "version": "2.27.1", + "resolved": "https://registry.npmjs.org/mdn-data/-/mdn-data-2.27.1.tgz", + "integrity": "sha512-9Yubnt3e8A0OKwxYSXyhLymGW4sCufcLG6VdiDdUGVkPhpqLxlvP5vl1983gQjJl3tqbrM731mjaZaP68AgosQ==", + "license": "CC0-1.0" + }, "node_modules/mime-db": { "version": "1.54.0", "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.54.0.tgz", @@ -4619,6 +5010,15 @@ "once": "^1.3.1" } }, + "node_modules/punycode": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", + "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/require-directory": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", @@ -4629,6 +5029,15 @@ "node": ">=0.10.0" } }, + "node_modules/require-from-string": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/require-from-string/-/require-from-string-2.0.2.tgz", + "integrity": "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/retry": { "version": "0.13.1", "resolved": "https://registry.npmjs.org/retry/-/retry-0.13.1.tgz", @@ -4694,6 +5103,18 @@ "license": "MIT", "peer": true }, + "node_modules/saxes": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/saxes/-/saxes-6.0.0.tgz", + "integrity": "sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==", + "license": "ISC", + "dependencies": { + "xmlchars": "^2.2.0" + }, + "engines": { + "node": ">=v12.22.7" + } + }, "node_modules/siginfo": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/siginfo/-/siginfo-2.0.0.tgz", @@ -4764,7 +5185,6 @@ "version": "1.2.1", "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", - "dev": true, "license": "BSD-3-Clause", "engines": { "node": ">=0.10.0" @@ -4881,6 +5301,12 @@ "node": ">=8" } }, + "node_modules/symbol-tree": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz", + "integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==", + "license": "MIT" + }, "node_modules/thenify": { "version": "3.3.1", "resolved": "https://registry.npmjs.org/thenify/-/thenify-3.3.1.tgz", @@ -4948,6 +5374,24 @@ "node": ">=14.0.0" } }, + "node_modules/tldts": { + "version": "7.4.3", + "resolved": "https://registry.npmjs.org/tldts/-/tldts-7.4.3.tgz", + "integrity": "sha512-A3BDQBeeukYPzB4QdQ1DtdlUmp4x2OCH8n5UVhEWbyANxNep8GavottKzd1xYKFJKjUgMyPT7EzOfnBO55s8Sg==", + "license": "MIT", + "dependencies": { + "tldts-core": "^7.4.3" + }, + "bin": { + "tldts": "bin/cli.js" + } + }, + "node_modules/tldts-core": { + "version": "7.4.3", + "resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-7.4.3.tgz", + "integrity": "sha512-27ep5H9PzdBrNd5OFM/j3WCU8F3kPwM9D0BOaOf7uYfxMJfyr0K5Tjj69Gri+sZlh2WXd5buIm47NuPF29CDiw==", + "license": "MIT" + }, "node_modules/token-types": { "version": "6.1.2", "resolved": "https://registry.npmjs.org/token-types/-/token-types-6.1.2.tgz", @@ -4967,6 +5411,30 @@ "url": "https://github.com/sponsors/Borewit" } }, + "node_modules/tough-cookie": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-6.0.1.tgz", + "integrity": "sha512-LktZQb3IeoUWB9lqR5EWTHgW/VTITCXg4D21M+lvybRVdylLrRMnqaIONLVb5mav8vM19m44HIcGq4qASeu2Qw==", + "license": "BSD-3-Clause", + "dependencies": { + "tldts": "^7.0.5" + }, + "engines": { + "node": ">=16" + } + }, + "node_modules/tr46": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-6.0.0.tgz", + "integrity": "sha512-bLVMLPtstlZ4iMQHpFHTR7GAGj2jxi8Dg0s2h2MafAE4uSWF98FC/3MomU51iQAMf8/qDUbKWf5GxuvvVcXEhw==", + "license": "MIT", + "dependencies": { + "punycode": "^2.3.1" + }, + "engines": { + "node": ">=20" + } + }, "node_modules/ts-algebra": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/ts-algebra/-/ts-algebra-2.0.0.tgz", @@ -5028,11 +5496,10 @@ } }, "node_modules/undici": { - "version": "7.25.0", - "resolved": "https://registry.npmjs.org/undici/-/undici-7.25.0.tgz", - "integrity": "sha512-xXnp4kTyor2Zq+J1FfPI6Eq3ew5h6Vl0F/8d9XU5zZQf1tX9s2Su1/3PiMmUANFULpmksxkClamIZcaUqryHsQ==", + "version": "7.28.0", + "resolved": "https://registry.npmjs.org/undici/-/undici-7.28.0.tgz", + "integrity": "sha512-cRZYrTDwWznlnRiPjggAGxZXanty6M8RV1ff8Wm4LWXBp7/IG8v5DnOm74DtUBp9OONpK75YlPnIjQqX0dBDtA==", "license": "MIT", - "peer": true, "engines": { "node": ">=20.18.1" } @@ -5232,6 +5699,18 @@ "dev": true, "license": "MIT" }, + "node_modules/w3c-xmlserializer": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-5.0.0.tgz", + "integrity": "sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA==", + "license": "MIT", + "dependencies": { + "xml-name-validator": "^5.0.0" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/web-streams-polyfill": { "version": "3.3.3", "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-3.3.3.tgz", @@ -5242,6 +5721,38 @@ "node": ">= 8" } }, + "node_modules/webidl-conversions": { + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-8.0.1.tgz", + "integrity": "sha512-BMhLD/Sw+GbJC21C/UgyaZX41nPt8bUTg+jWyDeg7e7YN4xOM05YPSIXceACnXVtqyEw/LMClUQMtMZ+PGGpqQ==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=20" + } + }, + "node_modules/whatwg-mimetype": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-5.0.0.tgz", + "integrity": "sha512-sXcNcHOC51uPGF0P/D4NVtrkjSU2fNsm9iog4ZvZJsL3rjoDAzXZhkm2MWt1y+PUdggKAYVoMAIYcs78wJ51Cw==", + "license": "MIT", + "engines": { + "node": ">=20" + } + }, + "node_modules/whatwg-url": { + "version": "16.0.1", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-16.0.1.tgz", + "integrity": "sha512-1to4zXBxmXHV3IiSSEInrreIlu02vUOvrhxJJH5vcxYTBDAx51cqZiKdyTxlecdKNSjj8EcxGBxNf6Vg+945gw==", + "license": "MIT", + "dependencies": { + "@exodus/bytes": "^1.11.0", + "tr46": "^6.0.0", + "webidl-conversions": "^8.0.1" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, "node_modules/why-is-node-running": { "version": "2.3.0", "resolved": "https://registry.npmjs.org/why-is-node-running/-/why-is-node-running-2.3.0.tgz", @@ -5329,6 +5840,15 @@ } } }, + "node_modules/xml-name-validator": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-5.0.0.tgz", + "integrity": "sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==", + "license": "Apache-2.0", + "engines": { + "node": ">=18" + } + }, "node_modules/xml-naming": { "version": "0.1.0", "resolved": "https://registry.npmjs.org/xml-naming/-/xml-naming-0.1.0.tgz", @@ -5345,6 +5865,12 @@ "node": ">=16.0.0" } }, + "node_modules/xmlchars": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz", + "integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==", + "license": "MIT" + }, "node_modules/y18n": { "version": "5.0.8", "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", diff --git a/package.json b/package.json index 5322205..5446348 100644 --- a/package.json +++ b/package.json @@ -38,10 +38,14 @@ "typebox": "*" }, "dependencies": { - "turndown": "^7.2.2" + "@mozilla/readability": "0.6.0", + "jsdom": "29.1.1", + "turndown": "^7.2.2", + "undici": "7.28.0" }, "devDependencies": { "@biomejs/biome": "2.3.5", + "@types/jsdom": "28.0.3", "@types/node": "^22.10.5", "@types/turndown": "^5.0.5", "@typescript/native-preview": "^7.0.0-dev.20260120.1", @@ -49,6 +53,6 @@ "vitest": "^4.1.5" }, "engines": { - "node": ">=20.0.0" + "node": ">=20.19.0" } } diff --git a/src/webfetch/content.ts b/src/webfetch/content.ts index fdd8c96..fba9a0b 100644 --- a/src/webfetch/content.ts +++ b/src/webfetch/content.ts @@ -1,5 +1,13 @@ +import { Readability } from "@mozilla/readability"; +import { JSDOM, VirtualConsole } from "jsdom"; import TurndownService from "turndown"; +interface ReadableArticle { + readonly title: string; + readonly content: string; + readonly hasHeading: boolean; +} + const TAGS_TO_REMOVE = /<(script|style|noscript|iframe|object|embed|meta|link)\b[^>]*>[\s\S]*?<\/\1>/gi; const VOID_TAGS_TO_REMOVE = /<(script|style|noscript|iframe|object|embed|meta|link)\b[^>]*\/?>/gi; const BLOCK_BREAK_TAGS = @@ -26,11 +34,28 @@ const turndownService = new TurndownService({ }); turndownService.remove(["script", "style", "noscript", "iframe", "object", "embed", "meta", "link"]); -export function htmlToMarkdown(html: string): string { - return turndownService.turndown(html).trim(); +export function htmlToMarkdown(html: string, url: string): string { + const article = extractReadableArticle(html, url); + if (!article) return turndownService.turndown(html).trim(); + + const markdown = turndownService.turndown(article.content).trim(); + if (!article.title || article.hasHeading || markdown.startsWith(`# ${article.title}`)) return markdown; + return `# ${article.title}\n\n${markdown}`.trim(); } -export function htmlToText(html: string): string { +export function htmlToText(html: string, url: string): string { + const article = extractReadableArticle(html, url); + if (article) { + const body = htmlFragmentToPlainText(article.content); + if (!article.title || article.hasHeading) return body; + if (body.startsWith(article.title)) return body; + return `${article.title}\n\n${body}`.trim(); + } + + return htmlFragmentToPlainText(html); +} + +function htmlFragmentToPlainText(html: string): string { return decodeHtmlEntities( html .replace(TAGS_TO_REMOVE, "") @@ -45,6 +70,44 @@ export function htmlToText(html: string): string { ); } +function extractReadableArticle(html: string, url: string): ReadableArticle | undefined { + try { + const dom = new JSDOM(html, { + url, + contentType: "text/html", + virtualConsole: new VirtualConsole(), + }); + try { + const article = new Readability(dom.window.document, { + charThreshold: 80, + keepClasses: false, + }).parse(); + if (!article?.content || !article.textContent) return undefined; + return { + title: normalizePlainText(article.title ?? ""), + content: article.content, + hasHeading: / { if (entity.startsWith("#x")) { diff --git a/src/webfetch/tool.ts b/src/webfetch/tool.ts index 335281c..2fb4cfa 100644 --- a/src/webfetch/tool.ts +++ b/src/webfetch/tool.ts @@ -73,10 +73,10 @@ export const webfetch = defineTool({ let converted = false; if (isHtml && format === "markdown") { - text = htmlToMarkdown(raw); + text = htmlToMarkdown(raw, fetched.url); converted = true; } else if (isHtml && format === "text") { - text = htmlToText(raw); + text = htmlToText(raw, fetched.url); converted = true; } diff --git a/test/webfetch.test.ts b/test/webfetch.test.ts index c841891..f0be352 100644 --- a/test/webfetch.test.ts +++ b/test/webfetch.test.ts @@ -108,6 +108,50 @@ describe("webfetch", () => { expect(result.details?.status).toBe(200); }); + it("#given article page with chrome #when fetching markdown #then returns reader-style main content", async () => { + // given + const server = await createFixtureServer((_request, response) => { + response.writeHead(200, { "content-type": "text/html; charset=utf-8" }); + response.end(` + + + Browser chrome should not win + + + + +
Subscribe now
+ +
+
+

Readable Article Title

+

Alpha opening paragraph with enough words to look like article content, not navigation.

+

Beta paragraph explains the important result and should stay in the fetched content.

+
+
+ +
Privacy policy and cookie settings
+ + + `); + }); + + // when + const result = await executeWebfetch({ url: `${server.baseUrl}/article`, format: "markdown" }); + const text = textContent(result); + + // then + expect(text).toContain("# Readable Article Title"); + expect(text).toContain("Alpha opening paragraph"); + expect(text).toContain("Beta paragraph explains"); + expect(text).not.toContain("Browser chrome should not win"); + expect(text).not.toContain("Subscribe now"); + expect(text).not.toContain("Topics"); + expect(text).not.toContain("Sponsored sidebar offer"); + expect(text).not.toContain("Privacy policy"); + expect(text).not.toContain("window.tracker"); + }); + it("#given html page #when fetching text #then returns readable text without tags", async () => { // given const server = await createFixtureServer((_request, response) => { @@ -125,6 +169,50 @@ describe("webfetch", () => { expect(result.details?.format).toBe("text"); }); + it("#given article page with chrome #when fetching text #then returns reader-style main content", async () => { + // given + const server = await createFixtureServer((_request, response) => { + response.writeHead(200, { "content-type": "text/html; charset=utf-8" }); + response.end(` + + + Text chrome title + + + +
Text subscribe banner
+ +
+
+

Readable Text Article

+

Gamma text paragraph with enough words to be selected as article content.

+

Delta text paragraph should stay after reader cleanup.

+
+
+ +
Text footer legal links
+ + + `); + }); + + // when + const result = await executeWebfetch({ url: `${server.baseUrl}/text-article`, format: "text" }); + const text = textContent(result); + + // then + expect(text).toContain("Readable Text Article"); + expect(text).toContain("Gamma text paragraph"); + expect(text).toContain("Delta text paragraph"); + expect(text).not.toContain("Text chrome title"); + expect(text).not.toContain("Text subscribe banner"); + expect(text).not.toContain("Latest text link"); + expect(text).not.toContain("Text sponsored sidebar"); + expect(text).not.toContain("Text footer legal links"); + expect(text).not.toContain("textTracker"); + expect(result.details?.format).toBe("text"); + }); + it("#given html page #when fetching html #then returns raw html", async () => { // given const html = "

Raw

HTML

";