diff --git a/examples/deployment/docker-compose.yml b/examples/deployment/docker-compose.yml new file mode 100644 index 00000000..5b98fa66 --- /dev/null +++ b/examples/deployment/docker-compose.yml @@ -0,0 +1,39 @@ +services: + html2rss: + image: html2rss/web:latest + env_file: .env + + caddy: + image: caddy:2-alpine + depends_on: + - html2rss + command: + - caddy + - reverse-proxy + - --from + - ${CADDY_HOST} + - --to + - html2rss:3000 + ports: + - "80:80" + - "443:443" + volumes: + - caddy_data:/data + + watchtower: + image: containrrr/watchtower + depends_on: + - html2rss + - caddy + command: + - --cleanup + - --interval + - "300" + - html2rss + - caddy + volumes: + - /var/run/docker.sock:/var/run/docker.sock:ro + restart: unless-stopped + +volumes: + caddy_data: diff --git a/src/components/FeedDirectory.astro b/src/components/FeedDirectory.astro index f9b88fb6..eed22330 100644 --- a/src/components/FeedDirectory.astro +++ b/src/components/FeedDirectory.astro @@ -1,142 +1,186 @@ --- import { configs } from "../data/loadConfigs"; -import { Icon, LinkButton } from "@astrojs/starlight/components"; - -// Simple helper functions -function getFeedUrl( - config: { - domain: string; - name: string; - url_parameters?: Record; - }, - instanceUrl: string, - params: Record = {}, -) { - const baseUrl = instanceUrl.endsWith("/") ? 
instanceUrl : `${instanceUrl}/`; - let url = `${baseUrl}${config.domain}/${config.name}.rss`; - - const queryParams = new URLSearchParams(); - Object.keys(config.url_parameters || {}).forEach((key) => { - if (params[key]) queryParams.append(key, params[key]); - }); - - const queryString = queryParams.toString(); - if (queryString) url += `?${queryString}`; - return url; +import { Icon } from "@astrojs/starlight/components"; + +const feedCount = configs.length; + +function formatDefaultParameters(defaultParameters: Record = {}) { + return Object.entries(defaultParameters) + .filter(([, value]) => value) + .map(([key, value]) => `${key}=${value}`) + .join(", "); } -// Don't generate static URLs to avoid exposing instance URL in build const staticFeedUrls = configs.map((config) => ({ ...config, - staticFeedUrl: "#", // Placeholder that will be updated by JavaScript + staticFeedUrl: "#", + defaultSummary: formatDefaultParameters(config.default_parameters), + sourceSummary: config.channel?.url + ?.replace(/^https?:\/\//, "") + .replace(/^www\./, "") + .replace(/\/$/, ""), })); --- -
- -
- - - - - +
+
+
+ +
+ + +
+

+ Search across + {feedCount} + ready-to-use feeds +

+
+ +
+
+ Using instance: + 1.h2r.workers.dev + +
+ + +
+
+ + -
+
{ staticFeedUrls.map((config, index) => ( -
-
-
-
- <> - {config.domain} - / - {config.name} - +
+
+

{config.sourceSummary || `${config.domain}/${config.name}`}

+ + +
- {config.channel?.url && ( -
+
+ {config.defaultSummary ? ( +

Defaults: {config.defaultSummary}

+ ) : ( + + )} + +
+ {config.channel?.url && ( - {config.channel.url} + View source -
- )} -
+ )} + + {Object.keys(config.url_parameters || {}).length > 0 && ( + + )} -
- {!config.valid_channel_url && Object.keys(config.url_parameters || {}).length > 0 ? ( - ) : ( -
- )} - - - - RSS - - - - - Edit - +
- {!config.valid_channel_url && Object.keys(config.url_parameters || {}).length > 0 && ( + {Object.keys(config.url_parameters || {}).length > 0 && ( ))} +
-
)} -
+ )) }
-
+
- diff --git a/src/components/docs/AutoGenerationOptional.astro b/src/components/docs/AutoGenerationOptional.astro index 35039fc9..982682bb 100644 --- a/src/components/docs/AutoGenerationOptional.astro +++ b/src/components/docs/AutoGenerationOptional.astro @@ -2,7 +2,7 @@ import { Aside } from "@astrojs/starlight/components"; --- - --- @@ -160,6 +161,22 @@ html2rss supports many configuration options: 4. **Check the output:** Make sure all items have titles, links, and descriptions +### Useful CLI flags when a site is difficult + +Some sites need a little more request budget than the defaults. + +- Use `--max-redirects` when the site bounces through several canonicalization or tracking redirects before the real page loads. +- Use `--max-requests` when your config needs more than one request, for example pagination or other follow-up fetches. + + + +Keep these values tight. Raise them only when the site proves it needs more. + ## Add It To html2rss-web Once the config works locally, add it to your `feeds.yml` or shared config repository and restart your diff --git a/src/content/docs/feed-directory/index.mdx b/src/content/docs/feed-directory/index.mdx index e0a3bf94..4b75c0c1 100644 --- a/src/content/docs/feed-directory/index.mdx +++ b/src/content/docs/feed-directory/index.mdx @@ -8,21 +8,15 @@ head: content: noindex --- -This directory contains a list of pre-built configurations to create RSS feeds for various websites. - -## Instance URL - -An Instance URL is the address of a running `html2rss-web` application. You can use a public instance, but we encourage you to host your own. - -[🚀 Host Your Own Instance (and share it!)](/web-application/how-to/deployment) +import FeedDirectory from "../../../components/FeedDirectory.astro"; -Find more public instances on the [community-run wiki](https://github.com/html2rss/html2rss-web/wiki/Instances). + --- -import FeedDirectory from "../../../components/FeedDirectory.astro"; +Need a different instance? 
You can use the built-in default, self-host your own, or find more options on the [community-run wiki](https://github.com/html2rss/html2rss-web/wiki/Instances). +[🚀 Host Your Own Instance (and share it!)](/web-application/how-to/deployment) --- @@ -30,4 +24,4 @@ import FeedDirectory from "../../../components/FeedDirectory.astro"; The feed configurations in this directory are community-driven. If you've created a new feed configuration, we encourage you to share it with the community. -[Contribute on GitHub](https://github.com/html2rss/html2rss-configs) +[Contribute on GitHub](https://github.com/html2rss/html2rss-configs/tree/master/lib/html2rss/configs) diff --git a/src/content/docs/getting-started.mdx b/src/content/docs/getting-started.mdx index f08de5ba..c3874cfe 100644 --- a/src/content/docs/getting-started.mdx +++ b/src/content/docs/getting-started.mdx @@ -1,10 +1,12 @@ --- title: "Getting Started" -description: "Learn how to get RSS feeds from any website. Start with existing feeds or create your own in minutes." +description: "Start html2rss-web locally, verify a working included feed from your self-hosted instance, and decide when to enable automatic generation or move to custom configs." sidebar: order: 1 --- +import Code from "astro/components/Code.astro"; + This page points to the main onboarding flow. 
## Start Here @@ -14,12 +16,26 @@ If you want the recommended path, go to [Run html2rss-web with Docker](/web-appl That guide is the canonical setup flow for: - running `html2rss-web` locally -- confirming your first successful feed -- deciding when to use included feeds, automatic generation, or custom configs +- confirming the interface is working +- opening a first included feed URL +- deciding when to use automatic generation or custom configs ## Quick Shortcuts -- **[Run html2rss-web with Docker](/web-application/getting-started)** - Recommended first step -- **[Browse working feed examples](/feed-directory/)** - See what success looks like -- **[Create Custom Feeds](/creating-custom-feeds)** - Write configs when you need more control -- **[Troubleshooting Guide](/troubleshooting/troubleshooting)** - Fix startup or extraction problems +- **[Run html2rss-web with Docker](/web-application/getting-started)**: recommended first step +- **[Browse working feed examples](/feed-directory/)**: see what successful outputs look like +- **[Use automatic feed generation](/web-application/how-to/use-automatic-feed-generation/)**: enable direct feed creation from a page URL when you want that workflow +- **[Create Custom Feeds](/creating-custom-feeds)**: write configs when you need more control +- **[Troubleshooting Guide](/troubleshooting/troubleshooting)**: fix startup or extraction problems + +## Using the Ruby CLI + +If you are working directly with the gem instead of `html2rss-web`, start with: + + + +If the target site is unusually redirect-heavy or needs extra follow-up requests, the CLI also supports: + + + +For config-driven runs, the same flags are available on `html2rss feed`. 
diff --git a/src/content/docs/index.mdx b/src/content/docs/index.mdx index 349bccae..8374aca4 100644 --- a/src/content/docs/index.mdx +++ b/src/content/docs/index.mdx @@ -1,101 +1,69 @@ --- -title: "Turn Any Website Into an RSS Feed - Never Miss Updates Again" -description: "Create RSS feeds from any website - no coding required. Turn blogs, news sites, and forums into RSS feeds you can follow in your favorite reader. Free, open source, and easy to use." +title: "Turn Any Website Into an RSS Feed" +description: "Run html2rss-web with Docker, verify a working included feed from your self-hosted instance, then consciously enable automatic generation or move to custom configs when you need more control." --- -Run `html2rss-web` with Docker, start with included feeds, and add custom configs only when you need more control. +Run `html2rss-web` with Docker, verify a working included feed from your self-hosted instance, and only then decide whether to enable automatic generation or move to custom configs. -## 🚀 Get Started in 30 Seconds +## Start Here -**Start here:** [Run html2rss-web with Docker](/web-application/getting-started) | [Browse working feed examples](/feed-directory/) +**Recommended path:** [Run html2rss-web with Docker](/web-application/getting-started) -Need more control? [Write a custom feed config](/creating-custom-feeds) +That guide is the canonical onboarding flow for: ---- +- starting a local instance +- verifying the web interface +- opening a first included feed URL +- deciding when to consciously enable automatic generation or move to custom configs ## How It Works 1. **Run your own local instance** with Docker -2. **Use included feeds or add your own** website targets -3. 
**Subscribe from your RSS reader** using stable feed URLs - ---- - -## Why RSS Still Matters Today - -**Real examples of what you can do:** - -- Follow your favorite blogs without social media algorithms -- Get notified when your local news site posts about your neighborhood -- Track job postings from multiple company websites -- Monitor product updates from software vendors -- Follow academic papers from your field - -**RSS vs Social Media:** - -- ✅ **No algorithms** deciding what you see -- ✅ **No ads** or sponsored content -- ✅ **Works with any feed reader** you choose -- ✅ **Your data stays private** -- ✅ **Never miss updates** - automatic notifications -- ✅ **Save time** - no more manual checking - ---- +2. **Open a built-in feed URL** from your own instance +3. **Copy the feed URL into your reader** ## What is html2rss? -html2rss is a toolkit for turning websites into RSS feeds. Think of it as a translator that converts website content into a format your feed reader can understand. +html2rss is a toolkit for turning websites into feeds. -**Most people should start with the web application:** +Most people should start with the web application: -- **🌐 html2rss-web** - The easiest way to run your own feed server with Docker -- **⚙️ html2rss gem** - The underlying engine, CLI, and developer interface +- **`html2rss-web`**: the self-hosted web interface and feed server +- **`html2rss` gem**: the Ruby engine, CLI, and lower-level config workflow ---- - -## 🎯 Choose Your Path +## Choose Your Path ### I want a working instance first -1. **[Run html2rss-web with Docker](/web-application/getting-started)** - Recommended starting path -2. **[Browse working feed examples](/feed-directory/)** - See what success looks like -3. **[Use the included configs](/web-application/how-to/use-included-configs/)** - Start with ready-made feeds +1. **[Run html2rss-web with Docker](/web-application/getting-started)**: recommended starting path +2. 
**[Use the included configs](/web-application/how-to/use-included-configs/)**: use real embedded feeds from your own instance +3. **[Browse working feed examples](/feed-directory/)**: see what working outputs look like ### I need more control -1. **[Creating Custom Feeds](/creating-custom-feeds)** - Write and test your own configs -2. **[Selectors Reference](/ruby-gem/reference/selectors/)** - Learn the matching rules -3. **[Strategy Reference](/ruby-gem/reference/strategy/)** - Use `browserless` for JS-heavy sites +1. **[Creating Custom Feeds](/creating-custom-feeds)**: write and test your own configs +2. **[Selectors Reference](/ruby-gem/reference/selectors/)**: learn the matching rules +3. **[Strategy Reference](/ruby-gem/reference/strategy/)**: decide when `browserless` is justified ### I'm building or integrating -1. **[Ruby Gem Reference](/ruby-gem/)** - Full API documentation -2. **[Advanced Features](/ruby-gem/how-to/advanced-features/)** - Custom HTTP requests, etc. -3. **[Contribute to Core](/get-involved/contributing/)** - Help improve the engine - ---- - -## 🌟 What People Are Using html2rss For - -- **News & Blogs:** Follow your favorite writers without social media -- **Job Hunting:** Track job postings from multiple company sites -- **Product Updates:** Get notified when software you use gets updated -- **Academic Research:** Follow new papers in your field -- **Local News:** Stay updated on your neighborhood and city -- **Hobby Communities:** Follow forums and communities you care about - -[Browse all examples in our Feed Directory →](/feed-directory/) - ---- - -## 🔧 Common Issues? +1. **[Ruby Gem Reference](/ruby-gem/)**: full API documentation +2. **[Advanced Features](/ruby-gem/how-to/advanced-features/)**: custom HTTP requests and advanced extraction +3. 
**[Contribute to Core](/get-involved/contributing/)**: help improve the engine -**Start with Docker, not a public instance.** That gives you the most reliable path and the newest integrated behavior. +## What People Use It For -**Feed not working?** Check our [troubleshooting guide](/troubleshooting/troubleshooting) +- follow blogs and news sites without social media algorithms +- track product updates and release notes +- monitor job postings from company websites +- subscribe to forums and communities that do not publish feeds +- follow local news without repeated manual checking -**Need custom control?** Continue to [Creating Custom Feeds](/creating-custom-feeds) +## Practical Notes -**Need help?** Join our [community discussions](https://github.com/orgs/html2rss/discussions) +- Start with Docker, not a public instance. +- Use an included feed to verify the deployment first. +- Enable automatic generation only when you want the direct page-URL workflow and are ready to allow it on your self-hosted instance. +- Move to custom configs when you need a stable, reviewable setup. -**Found a bug?** [Report it on GitHub](https://github.com/html2rss/html2rss/issues) +**Need help?** Continue to the [troubleshooting guide](/troubleshooting/troubleshooting) or join [GitHub Discussions](https://github.com/orgs/html2rss/discussions). diff --git a/src/content/docs/ruby-gem/how-to/advanced-features.mdx b/src/content/docs/ruby-gem/how-to/advanced-features.mdx index 703bd9e9..7d1088b3 100644 --- a/src/content/docs/ruby-gem/how-to/advanced-features.mdx +++ b/src/content/docs/ruby-gem/how-to/advanced-features.mdx @@ -7,13 +7,7 @@ This guide covers advanced features and performance optimizations for html2rss. ## Parallel Processing -html2rss uses parallel processing to improve performance when scraping multiple items. This happens automatically and doesn't require any configuration. 
- -### How It Works - -- **Auto-source scraping:** Multiple scrapers run in parallel to analyze the page -- **Item processing:** Each scraped item is processed in parallel -- **Performance benefit:** Significantly faster when dealing with many items +html2rss uses parallel processing in auto-source discovery. This happens automatically and doesn't require any configuration. ### Performance Tips @@ -88,7 +82,7 @@ LOG_LEVEL=debug html2rss feed config.yml Use the health check endpoint to monitor feed generation: ```bash -curl -u username:password http://localhost:3000/health_check.txt +curl -u username:password http://localhost:4000/health_check.txt ``` ## Article Validation diff --git a/src/content/docs/ruby-gem/how-to/custom-http-requests.mdx b/src/content/docs/ruby-gem/how-to/custom-http-requests.mdx index 33b6cca3..23fdfa7b 100644 --- a/src/content/docs/ruby-gem/how-to/custom-http-requests.mdx +++ b/src/content/docs/ruby-gem/how-to/custom-http-requests.mdx @@ -3,7 +3,15 @@ title: "Custom HTTP Requests" description: "Learn how to customize HTTP requests with custom headers, authentication, and API interactions for html2rss." --- -Some websites require custom HTTP headers, authentication, or other request settings to access their content. `html2rss` lets you customize requests for those cases. +import Code from "astro/components/Code.astro"; + +Some sites only work when requests carry the headers, tokens, or cookies your browser uses. `html2rss` supports those cases without changing the rest of your feed workflow. + +Keep this structure in mind: + +- `headers` stays top-level +- `strategy` stays top-level +- request-specific controls such as budgets and Browserless options live under `request` ## When You Need Custom Headers @@ -19,8 +27,8 @@ You might need custom HTTP requests when: Add a `headers` section to your feed configuration. 
This example is a complete, valid config: -```yaml -headers: + + +## Request Controls + +Request budgets are configured under `request`, not as top-level keys: + + + +- `request.max_redirects` limits redirect hops +- `request.max_requests` limits the total request budget for the feed build +- `request.browserless.*` is reserved for Browserless-only behavior such as preload actions ## Common Use Cases diff --git a/src/content/docs/ruby-gem/how-to/handling-dynamic-content.mdx b/src/content/docs/ruby-gem/how-to/handling-dynamic-content.mdx index c0e5e379..2ca0db72 100644 --- a/src/content/docs/ruby-gem/how-to/handling-dynamic-content.mdx +++ b/src/content/docs/ruby-gem/how-to/handling-dynamic-content.mdx @@ -3,12 +3,38 @@ title: Handling Dynamic Content description: "Learn how to handle JavaScript-heavy websites and dynamic content with html2rss. Use browserless strategy for sites that load content dynamically." --- +import Code from "astro/components/Code.astro"; + Some websites load their content dynamically using JavaScript. The default `html2rss` strategy might not see this content. ## Solution Use the [`browserless` strategy](/ruby-gem/reference/strategy) to render JavaScript-heavy websites with a headless browser. +Keep the strategy at the top level and put request-specific options under `request`: + + + ## When to Use Browserless The `browserless` strategy is necessary when: @@ -18,6 +44,53 @@ The `browserless` strategy is necessary when: - **Infinite scroll** - Content loads as you scroll - **Dynamic forms** - Content changes based on user interaction +## Preload Actions + +For dynamic sites, rendering once is often not enough. Use `request.browserless.preload` to wait, click, or scroll before the +HTML snapshot is taken. 
+ +### Wait for JavaScript Requests + +```yaml +strategy: browserless +request: + browserless: + preload: + wait_for_network_idle: + timeout_ms: 4000 +``` + +### Click "Load More" Buttons + +```yaml +strategy: browserless +request: + browserless: + preload: + click_selectors: + - selector: ".load-more" + max_clicks: 3 + delay_ms: 250 + wait_for_network_idle: + timeout_ms: 3000 +``` + +### Scroll Infinite Lists + +```yaml +strategy: browserless +request: + browserless: + preload: + scroll_down: + iterations: 5 + delay_ms: 200 + wait_for_network_idle: + timeout_ms: 2500 +``` + +These preload steps can be combined in a single config when a site needs several interactions before all items appear. + ## Performance Considerations The `browserless` strategy is slower than the default `faraday` strategy because it: diff --git a/src/content/docs/ruby-gem/reference/auto-source.mdx b/src/content/docs/ruby-gem/reference/auto-source.mdx index 33454232..82e92df0 100644 --- a/src/content/docs/ruby-gem/reference/auto-source.mdx +++ b/src/content/docs/ruby-gem/reference/auto-source.mdx @@ -17,16 +17,19 @@ auto_source: {} `auto_source` uses the following strategies to find content: -1. **`schema`:** Parses `