diff --git a/docs/_ext/package_reference.py b/docs/_ext/package_reference.py index 16bfe3ca..64a7464c 100644 --- a/docs/_ext/package_reference.py +++ b/docs/_ext/package_reference.py @@ -204,6 +204,7 @@ class PackageDocsRecord: "sphinx-vite-builder": "build-seo", "sphinx-gp-opengraph": "build-seo", "sphinx-gp-sitemap": "build-seo", + "sphinx-gp-llms": "build-seo", } diff --git a/docs/packages/sphinx-gp-llms/index.md b/docs/packages/sphinx-gp-llms/index.md new file mode 100644 index 00000000..df5b3e5b --- /dev/null +++ b/docs/packages/sphinx-gp-llms/index.md @@ -0,0 +1,6 @@ +(sphinx-gp-llms)= + +# sphinx-gp-llms + +```{package-landing} sphinx-gp-llms +``` diff --git a/docs/redirects.txt b/docs/redirects.txt index 7f7971e3..ac574907 100644 --- a/docs/redirects.txt +++ b/docs/redirects.txt @@ -1,5 +1,6 @@ extensions/sphinx-gp-opengraph packages/sphinx-gp-opengraph extensions/sphinx-gp-sitemap packages/sphinx-gp-sitemap +extensions/sphinx-gp-llms packages/sphinx-gp-llms extensions/gp-sphinx packages/gp-sphinx extensions/index packages/index extensions/sphinx-autodoc-argparse packages/sphinx-autodoc-argparse diff --git a/packages/gp-furo-theme/src/gp_furo_theme/theme/gp-furo/page.html b/packages/gp-furo-theme/src/gp_furo_theme/theme/gp-furo/page.html index d288eb56..309b4049 100644 --- a/packages/gp-furo-theme/src/gp_furo_theme/theme/gp-furo/page.html +++ b/packages/gp-furo-theme/src/gp_furo_theme/theme/gp-furo/page.html @@ -184,9 +184,32 @@ {%- endif -%}
Source: {{ _docpath }} - {%- if theme_source_branch %} + {%- if theme_source_branch or llms_md_url is defined %} · - Machine-readable: Raw source + Machine-readable: + {%- if llms_md_url is defined %} + Markdown, + {%- endif %} + {%- if theme_source_branch %} + raw source{%- if llms_json_url is defined %},{%- endif %} + {%- endif %} + {%- if llms_json_url is defined %} + docs.json, + llms.txt, + llms-full.txt + {%- endif %} + {%- endif %} +
+ {%- elif llms_md_url is defined or llms_json_url is defined -%} +
+ Machine-readable: + {%- if llms_md_url is defined %} + Markdown{%- if llms_json_url is defined %},{%- endif %} + {%- endif %} + {%- if llms_json_url is defined %} + docs.json, + llms.txt, + llms-full.txt {%- endif %}
{%- endif %} diff --git a/packages/gp-sphinx/src/gp_sphinx/defaults.py b/packages/gp-sphinx/src/gp_sphinx/defaults.py index 5def5531..0810c2db 100644 --- a/packages/gp-sphinx/src/gp_sphinx/defaults.py +++ b/packages/gp-sphinx/src/gp_sphinx/defaults.py @@ -86,6 +86,7 @@ class FontConfig(_FontConfigRequired, total=False): "sphinx_copybutton", "sphinx_gp_opengraph", "sphinx_gp_sitemap", + "sphinx_gp_llms", "sphinxext.rediraffe", "sphinx_design", "myst_parser", @@ -96,7 +97,7 @@ class FontConfig(_FontConfigRequired, total=False): Examples -------- >>> len(DEFAULT_EXTENSIONS) -13 +14 >>> DEFAULT_EXTENSIONS[0] 'sphinx.ext.autodoc' diff --git a/packages/sphinx-gp-llms/README.md b/packages/sphinx-gp-llms/README.md new file mode 100644 index 00000000..99692bc6 --- /dev/null +++ b/packages/sphinx-gp-llms/README.md @@ -0,0 +1,6 @@ +# sphinx-gp-llms + +LLM-friendly documentation outputs for Sphinx. + +Generates `llms.txt`, `llms-full.txt`, `docs.json`, and per-page `.md` +twin files during the standard HTML build. 
diff --git a/packages/sphinx-gp-llms/pyproject.toml b/packages/sphinx-gp-llms/pyproject.toml new file mode 100644 index 00000000..c7316484 --- /dev/null +++ b/packages/sphinx-gp-llms/pyproject.toml @@ -0,0 +1,40 @@ +[project] +name = "sphinx-gp-llms" +version = "0.0.1a23" +description = "LLM-friendly documentation outputs for Sphinx — llms.txt, llms-full.txt, docs.json, per-page Markdown" +requires-python = ">=3.10,<4.0" +authors = [ + {name = "Tony Narlock", email = "tony@git-pull.com"} +] +license = { text = "MIT" } +classifiers = [ + "Development Status :: 3 - Alpha", + "License :: OSI Approved :: MIT License", + "Framework :: Sphinx", + "Framework :: Sphinx :: Extension", + "Intended Audience :: Developers", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Topic :: Documentation", + "Topic :: Documentation :: Sphinx", + "Typing :: Typed", +] +readme = "README.md" +keywords = ["sphinx", "llm", "documentation", "ai", "llms-txt"] +dependencies = [ + "sphinx>=8.1", +] + +[project.urls] +Repository = "https://github.com/git-pull/gp-sphinx" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/sphinx_gp_llms"] diff --git a/packages/sphinx-gp-llms/src/sphinx_gp_llms/__init__.py b/packages/sphinx-gp-llms/src/sphinx_gp_llms/__init__.py new file mode 100644 index 00000000..9770b3f4 --- /dev/null +++ b/packages/sphinx-gp-llms/src/sphinx_gp_llms/__init__.py @@ -0,0 +1,218 @@ +"""LLM-friendly documentation outputs for Sphinx. 
def setup(app: Sphinx) -> ExtensionMetadata:
    """Register config values and connect build hooks.

    Parameters
    ----------
    app : Sphinx
        Sphinx application instance.

    Returns
    -------
    ExtensionMetadata
        Extension metadata with version and parallel-build flags.

    Notes
    -----
    The ``llms_generate_*`` flags and ``llms_*_filename`` values are copied
    into the page template context by the ``html-page-context`` hook, so a
    change to any of them must re-render HTML pages; they are registered
    with ``rebuild="html"``. ``llms_excludes`` and
    ``llms_description_length`` only influence files written at
    ``build-finished`` and keep ``rebuild=""``.

    Examples
    --------
    >>> from sphinx_gp_llms import setup
    >>> callable(setup)
    True
    """
    # (name, default, rebuild, accepted types, description)
    config_values: tuple[
        tuple[str, t.Any, t.Literal["", "html"], frozenset[type], str],
        ...,
    ] = (
        (
            "llms_generate_txt",
            True,
            "html",
            frozenset({bool}),
            "Enable llms.txt generation.",
        ),
        (
            "llms_generate_full",
            True,
            "html",
            frozenset({bool}),
            "Enable llms-full.txt generation.",
        ),
        (
            "llms_generate_json",
            True,
            "html",
            frozenset({bool}),
            "Enable docs.json agent manifest generation.",
        ),
        (
            "llms_generate_md_twins",
            True,
            "html",
            frozenset({bool}),
            "Enable per-page .md twin file generation.",
        ),
        (
            "llms_txt_filename",
            "llms.txt",
            "html",
            frozenset({str}),
            "Output filename for the llms.txt index.",
        ),
        (
            "llms_full_filename",
            "llms-full.txt",
            "html",
            frozenset({str}),
            "Output filename for the concatenated full-content file.",
        ),
        (
            "llms_json_filename",
            "docs.json",
            "html",
            frozenset({str}),
            "Output filename for the docs.json agent manifest.",
        ),
        (
            "llms_excludes",
            [],
            "",
            frozenset({list}),
            "fnmatch patterns matched against each page's relative URL. "
            "Matched pages are excluded from all LLM outputs.",
        ),
        (
            "llms_description_length",
            200,
            "",
            frozenset({int}),
            "Maximum character length for page descriptions.",
        ),
    )
    for name, default, rebuild, types, description in config_values:
        app.add_config_value(
            name,
            default=default,
            rebuild=rebuild,
            types=types,
            description=description,
        )

    # Another extension may already own ``site_url``; a duplicate
    # registration raises ExtensionError, which is safe to ignore here.
    with contextlib.suppress(ExtensionError):
        app.add_config_value(
            "site_url",
            default=None,
            rebuild="",
            types=frozenset({str, type(None)}),
            description=(
                "Site base URL — registered defensively; "
                "sphinx-gp-sitemap usually registers this first."
            ),
        )

    app.connect("build-finished", _write_llm_outputs)
    app.connect("html-page-context", _inject_llms_context)

    return {
        "version": _EXTENSION_VERSION,
        "parallel_read_safe": True,
        "parallel_write_safe": True,
    }
def _resolve_site_url(app: Sphinx) -> str | None:
    """Resolve the site base URL from config, normalizing the trailing slash.

    ``site_url`` takes precedence over ``html_baseurl``. Returns ``None``
    when neither is set to a non-empty value.
    """
    url: str | None = getattr(app.config, "site_url", None) or getattr(
        app.config, "html_baseurl", None
    )
    if not url:
        return None
    return url if url.endswith("/") else url + "/"


def _has_md_twin(app: Sphinx, pagename: str) -> bool:
    """Return True when the md-twin writer would emit ``<pagename>.md``.

    Generated pages (index, search, ...) are not source documents, and
    pages matching ``llms_excludes`` are skipped by the writers, so neither
    has a Markdown sibling to link to.
    """
    import fnmatch

    if pagename not in app.env.found_docs:
        return False
    uri = app.builder.get_target_uri(pagename)
    return not any(fnmatch.fnmatch(uri, p) for p in app.config.llms_excludes)


def _write_llm_outputs(app: Sphinx, exception: BaseException | None) -> None:
    """Generate all enabled LLM output files at build-finished.

    Nothing is written when the build failed, when the active builder does
    not produce HTML, or when no site URL is configured.
    """
    if exception is not None:
        return

    # ``get_target_uri`` exists on every Sphinx builder (the base class
    # declares it), so its presence cannot identify an HTML build; the
    # builder's declared output format can.
    if getattr(app.builder, "format", "") != "html":
        return

    site_url = _resolve_site_url(app)
    if not site_url:
        logger.info(
            "sphinx-gp-llms: skipped — site_url and html_baseurl both unset",
            type="llms",
            subtype="configuration",
        )
        return

    # Writers are imported lazily so disabled outputs cost nothing.
    if app.config.llms_generate_txt:
        from sphinx_gp_llms._llms_txt import write_llms_txt

        write_llms_txt(app, site_url)

    if app.config.llms_generate_full:
        from sphinx_gp_llms._llms_full_txt import write_llms_full_txt

        write_llms_full_txt(app, site_url)

    if app.config.llms_generate_json:
        from sphinx_gp_llms._docs_json import write_docs_json

        write_docs_json(app, site_url)

    if app.config.llms_generate_md_twins:
        from sphinx_gp_llms._md_twins import write_md_twins

        write_md_twins(app)


def _inject_llms_context(
    app: Sphinx,
    pagename: str,
    templatename: str,
    context: dict[str, t.Any],
    doctree: nodes.document | None,
) -> None:
    """Add LLM output link variables to the Jinja2 template context.

    Variables are only set when a site URL is configured, mirroring the
    condition under which the files are written. ``llms_md_url`` is only
    set for pages that actually get a ``.md`` twin.
    """
    del templatename, doctree

    if not _resolve_site_url(app):
        return

    config = app.config
    if config.llms_generate_md_twins and _has_md_twin(app, pagename):
        context["llms_md_url"] = pagename + ".md"
    if config.llms_generate_txt:
        context["llms_txt_url"] = config.llms_txt_filename
    if config.llms_generate_full:
        context["llms_full_url"] = config.llms_full_filename
    if config.llms_generate_json:
        context["llms_json_url"] = config.llms_json_filename
def _truncate_description(text: str, max_length: int) -> str:
    """Clamp *text* to *max_length* characters, marking a cut with ``...``."""
    if len(text) <= max_length:
        return text
    if max_length < 3:
        # Too short to hold the ellipsis; a plain ``max_length - 3`` slice
        # end would go negative and keep nearly the whole paragraph.
        return text[: max(max_length, 0)]
    return text[: max_length - 3] + "..."


def get_first_paragraph(
    app: Sphinx,
    docname: str,
    max_length: int = 200,
) -> str:
    """Extract the first body paragraph from a page's doctree.

    Paragraphs rejected by ``_is_body_paragraph``, empty paragraphs, and a
    paragraph whose text equals the page title are skipped.

    Parameters
    ----------
    app : Sphinx
        Sphinx application instance.
    docname : str
        Document name (without extension).
    max_length : int
        Maximum characters to return.

    Returns
    -------
    str
        Paragraph text with newlines replaced by spaces, truncated with
        ``...`` when exceeding *max_length*. Empty string when the page
        has no usable paragraph.

    Examples
    --------
    >>> from sphinx_gp_llms._description import get_first_paragraph
    >>> callable(get_first_paragraph)
    True
    """
    doctree = app.env.get_doctree(docname)
    title_text = ""
    if docname in app.env.titles:
        title_text = app.env.titles[docname].astext()

    for node in doctree.findall(nodes.paragraph):
        if not _is_body_paragraph(node):
            continue
        text = node.astext().replace("\n", " ").strip()
        if not text or text == title_text:
            continue
        return _truncate_description(text, max_length)
    return ""
class _Heading(t.TypedDict):
    """One entry of a page's heading outline in ``docs.json``."""

    # Anchor name with the leading "#" removed; empty when the toc
    # reference carries no anchor.
    id: str
    # Nesting depth in the page's toc, starting at 1.
    level: int
    # Rendered heading text.
    text: str


class _Page(t.TypedDict):
    """One documented page in the manifest's ``pages`` array."""

    title: str
    # First body paragraph, limited by ``llms_description_length``.
    description: str
    # Caption of the root-document toctree listing the page, or "".
    section: str
    # Root-relative HTML URL ("/" + builder target URI).
    url: str
    # Root-relative URL of the Markdown twin ("/" + docname + ".md").
    markdownUrl: str
    headings: list[_Heading]


class _AgentEntrypoints(t.TypedDict):
    """Root-relative URLs of the machine-readable entry files."""

    manifest: str
    llms: str
    llmsFull: str


class _DocsManifest(t.TypedDict):
    """Top-level structure serialized to ``docs.json``."""

    # Sphinx ``project`` name.
    name: str
    # Site base URL without the trailing slash.
    url: str
    # First body paragraph of the root document.
    description: str
    # ``source_repository`` theme option, or "" when unset.
    sourceRepository: str
    agentEntrypoints: _AgentEntrypoints
    pages: list[_Page]
def write_docs_json(app: Sphinx, site_url: str) -> None:
    """Serialize the agent manifest for the built site to ``docs.json``.

    Every found document that is not excluded and has a title becomes one
    ``pages`` entry; the file is written into the build output directory
    under ``llms_json_filename``.

    Parameters
    ----------
    app : Sphinx
        Sphinx application instance.
    site_url : str
        Normalized site base URL with trailing slash.

    Examples
    --------
    >>> from sphinx_gp_llms._docs_json import write_docs_json
    >>> callable(write_docs_json)
    True
    """
    config = app.config
    env = app.env
    patterns: list[str] = list(config.llms_excludes)

    # docname -> caption of the toctree section that lists it
    caption_by_doc: dict[str, str] = {
        docname: section.caption or "Documentation"
        for section in extract_toctree_sections(app)
        for docname in section.docnames
    }

    page_entries: list[_Page] = []
    for docname in sorted(env.found_docs):
        target = app.builder.get_target_uri(docname)
        if _is_excluded(target, patterns):
            continue
        title_node = env.titles.get(docname)
        if title_node is None:
            continue

        entry: _Page = {
            "title": title_node.astext(),
            "description": get_first_paragraph(
                app, docname, config.llms_description_length
            ),
            "section": caption_by_doc.get(docname, ""),
            "url": "/" + target,
            "markdownUrl": "/" + docname + ".md",
            "headings": _extract_headings(app, docname),
        }
        page_entries.append(entry)

    repository = _get_source_repository(app)
    summary = get_first_paragraph(
        app, config.root_doc, config.llms_description_length
    )

    manifest: _DocsManifest = {
        "name": config.project,
        "url": site_url.rstrip("/"),
        "description": summary,
        "sourceRepository": repository,
        "agentEntrypoints": {
            "manifest": "/" + config.llms_json_filename,
            "llms": "/" + config.llms_txt_filename,
            "llmsFull": "/" + config.llms_full_filename,
        },
        "pages": page_entries,
    }

    destination = pathlib.Path(app.outdir) / config.llms_json_filename
    payload = json.dumps(manifest, indent=2, ensure_ascii=False) + "\n"
    destination.write_text(payload, encoding="utf-8")
    logger.info(
        "sphinx-gp-llms: %s generated at %s",
        config.llms_json_filename,
        destination,
        type="llms",
        subtype="information",
    )
def _extract_headings(app: Sphinx, docname: str) -> list[_Heading]:
    """Extract heading id/level/text from the table-of-contents tree.

    Returns an empty list when the environment holds no toc for *docname*.
    """
    toc = app.env.tocs.get(docname)
    if toc is None:
        return []
    headings: list[_Heading] = []
    _walk_toc(toc, level=1, headings=headings)
    return headings


def _walk_toc(
    node: nodes.Node,
    level: int,
    headings: list[_Heading],
) -> None:
    """Recursively walk a toc bullet_list, collecting headings.

    Results are appended to *headings* in document order. *level* is the
    depth assigned to references found directly under *node*; each nested
    bullet list adds one.
    """
    if isinstance(node, nodes.bullet_list):
        for item in node.children:
            _walk_toc(item, level, headings)
    elif isinstance(node, nodes.list_item):
        for child in node.children:
            if isinstance(child, addnodes.compact_paragraph):
                for ref in child.findall(nodes.reference):
                    anchor = ref.get("anchorname", "")
                    text = ref.astext()
                    heading_id = anchor.lstrip("#") if anchor else ""
                    if text:
                        headings.append(_Heading(id=heading_id, level=level, text=text))
            elif isinstance(child, nodes.bullet_list):
                _walk_toc(child, level + 1, headings)


def _get_source_repository(app: Sphinx) -> str:
    """Read ``source_repository`` from the HTML theme options.

    Returns an empty string when the theme options are missing, are not a
    dict, or hold no repository value.
    """
    theme_opts = getattr(app.config, "html_theme_options", None)
    if not isinstance(theme_opts, dict):
        return ""
    repository = theme_opts.get("source_repository")
    # An explicit ``None`` must not be stringified into the literal "None".
    return "" if repository is None else str(repository)


def _is_excluded(uri: str, patterns: list[str]) -> bool:
    """Return True when *uri* matches any fnmatch pattern."""
    return any(fnmatch.fnmatch(uri, p) for p in patterns)
def write_llms_full_txt(app: Sphinx, site_url: str) -> None:
    """Concatenate every page's source into ``llms-full.txt``.

    Each non-excluded document contributes a ``# <title>`` header (the
    docname when the page has no title), a ``Source:`` line with its
    absolute URL, the raw source text, and a ``---`` divider. An unreadable
    source file is replaced by a placeholder line.

    Parameters
    ----------
    app : Sphinx
        Sphinx application instance.
    site_url : str
        Normalized site base URL with trailing slash.

    Examples
    --------
    >>> from sphinx_gp_llms._llms_full_txt import write_llms_full_txt
    >>> callable(write_llms_full_txt)
    True
    """
    env = app.env
    patterns: list[str] = list(app.config.llms_excludes)
    chunks: list[str] = []

    for docname in sorted(env.found_docs):
        target = app.builder.get_target_uri(docname)
        if _is_excluded(target, patterns):
            continue

        title_node = env.titles.get(docname)
        heading = docname if title_node is None else title_node.astext()
        source_file = pathlib.Path(env.doc2path(docname))

        try:
            body = source_file.read_text(encoding="utf-8").rstrip()
        except (OSError, UnicodeDecodeError):
            body = f"(source not available for {docname})"

        chunks.extend(
            (
                f"# {heading}",
                f"Source: {site_url + target}",
                "",
                body,
                "",
                "---",
                "",
            )
        )

    destination = pathlib.Path(app.outdir) / app.config.llms_full_filename
    destination.write_text("\n".join(chunks), encoding="utf-8")
    logger.info(
        "sphinx-gp-llms: %s generated at %s",
        app.config.llms_full_filename,
        destination,
        type="llms",
        subtype="information",
    )


def _is_excluded(uri: str, patterns: list[str]) -> bool:
    """Tell whether *uri* matches at least one fnmatch pattern."""
    for pattern in patterns:
        if fnmatch.fnmatch(uri, pattern):
            return True
    return False
def _section_block(heading: str, entries: list[str]) -> list[str]:
    """Return the llms.txt lines for one H2 section, or none if it is empty."""
    if not entries:
        return []
    return [f"## {heading}", "", *entries, ""]


def write_llms_txt(app: Sphinx, site_url: str) -> None:
    """Write ``llms.txt`` to the build output directory.

    The file starts with an H1 project name and, when the root document
    has a first paragraph, a blockquote summary. Each toctree section
    follows as an H2 with one ``- [title](url): description`` bullet per
    page. Sections whose pages are all excluded or untitled are omitted
    so no empty heading is emitted.

    Parameters
    ----------
    app : Sphinx
        Sphinx application instance.
    site_url : str
        Normalized site base URL with trailing slash.

    Examples
    --------
    >>> from sphinx_gp_llms._llms_txt import write_llms_txt
    >>> callable(write_llms_txt)
    True
    """
    excludes: list[str] = list(app.config.llms_excludes)
    sections = extract_toctree_sections(app)
    max_len: int = app.config.llms_description_length

    lines: list[str] = [f"# {app.config.project}", ""]

    desc = get_first_paragraph(app, app.config.root_doc, max_len)
    if desc:
        lines.append(f"> {desc}")
        lines.append("")

    for section in sections:
        entries: list[str] = []
        for docname in section.docnames:
            uri = app.builder.get_target_uri(docname)
            if _is_excluded(uri, excludes):
                continue
            title_node = app.env.titles.get(docname)
            if title_node is None:
                continue
            entry = f"- [{title_node.astext()}]({site_url + uri})"
            page_desc = get_first_paragraph(app, docname, max_len)
            if page_desc:
                entry += f": {page_desc}"
            entries.append(entry)
        lines.extend(_section_block(section.caption or "Documentation", entries))

    output = pathlib.Path(app.outdir) / app.config.llms_txt_filename
    output.write_text("\n".join(lines), encoding="utf-8")
    logger.info(
        "sphinx-gp-llms: %s generated at %s",
        app.config.llms_txt_filename,
        output,
        type="llms",
        subtype="information",
    )


def _is_excluded(uri: str, patterns: list[str]) -> bool:
    """Return True when *uri* matches any fnmatch pattern."""
    return any(fnmatch.fnmatch(uri, p) for p in patterns)
def write_md_twins(app: Sphinx) -> None:
    """Place a ``<docname>.md`` copy of each source file in the output tree.

    Excluded pages and documents whose source file does not exist are
    skipped. Parent directories are created as needed and the number of
    copied files is logged.

    Parameters
    ----------
    app : Sphinx
        Sphinx application instance.

    Examples
    --------
    >>> from sphinx_gp_llms._md_twins import write_md_twins
    >>> callable(write_md_twins)
    True
    """
    patterns: list[str] = list(app.config.llms_excludes)
    build_root = pathlib.Path(app.outdir)
    written = 0

    for docname in sorted(app.env.found_docs):
        if _is_excluded(app.builder.get_target_uri(docname), patterns):
            continue

        source_file = pathlib.Path(app.env.doc2path(docname))
        if source_file.exists():
            twin = build_root / (docname + ".md")
            twin.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(source_file, twin)
            written += 1

    logger.info(
        "sphinx-gp-llms: %d .md twin files written",
        written,
        type="llms",
        subtype="information",
    )


def _is_excluded(uri: str, patterns: list[str]) -> bool:
    """Tell whether *uri* matches at least one fnmatch pattern."""
    for pattern in patterns:
        if fnmatch.fnmatch(uri, pattern):
            return True
    return False
class ToctreeSection(t.NamedTuple):
    """One section of pages grouped by toctree caption.

    Examples
    --------
    >>> ToctreeSection(caption="API", docnames=["api/index"])
    ToctreeSection(caption='API', docnames=['api/index'])
    """

    # Toctree ``:caption:`` value; ``None`` for an uncaptioned toctree and
    # for the fallback section of pages not listed in the root document.
    caption: str | None
    # Document names belonging to this section.
    docnames: list[str]
def extract_toctree_sections(app: Sphinx) -> list[ToctreeSection]:
    """Group the project's pages by the root document's toctree captions.

    Parameters
    ----------
    app : Sphinx
        Sphinx application instance (must have a built environment).

    Returns
    -------
    list[ToctreeSection]
        One section per root-document toctree that contributes at least
        one page, in document order. A page is assigned to the first
        toctree that lists it. Found documents listed by no root toctree
        (the root document itself excepted) are collected, sorted, into a
        final ``caption=None`` section.

    Examples
    --------
    >>> from sphinx_gp_llms._toctree import extract_toctree_sections
    >>> callable(extract_toctree_sections)
    True
    """
    root_doc = app.config.root_doc
    known_docs = app.env.found_docs
    seen: set[str] = set()
    grouped: list[ToctreeSection] = []

    for toctree in app.env.get_doctree(root_doc).findall(addnodes.toctree):
        caption = toctree.get("caption")
        members: list[str] = []
        for _title, target in toctree["entries"]:
            # Skip entries that are not project documents, and duplicates.
            if not target or target not in known_docs or target in seen:
                continue
            members.append(target)
            seen.add(target)
        if members:
            grouped.append(ToctreeSection(caption=caption, docnames=members))

    leftovers = sorted(known_docs - seen - {root_doc})
    if leftovers:
        grouped.append(ToctreeSection(caption=None, docnames=leftovers))

    return grouped
def smoke_sphinx_gp_llms(dist_dir: pathlib.Path, version: str) -> None:
    """Check that sphinx-gp-llms installs into a fresh venv and imports.

    *version* is not used here; it is accepted so the function matches the
    ``(dist_dir, version)`` signature of the smoke-runner registry.
    """
    import_check = (
        "import sphinx_gp_llms; "
        "from sphinx_gp_llms import setup; "
        "assert callable(setup)"
    )
    with tempfile.TemporaryDirectory() as scratch:
        interpreter = _create_venv(pathlib.Path(scratch))
        wheels = _workspace_wheel_requirements(dist_dir)
        _install_into_venv(interpreter, *wheels)
        _run_python(interpreter, import_check)
class LlmsBuildResult(t.NamedTuple):
    """Build result plus the expected locations of the LLM output files."""

    result: SharedSphinxResult
    llms_txt_path: pathlib.Path
    llms_full_path: pathlib.Path
    docs_json_path: pathlib.Path


@pytest.fixture(scope="module")
def llms_build(
    tmp_path_factory: pytest.TempPathFactory,
) -> LlmsBuildResult:
    """Build the synthetic five-file project once per test module."""
    cache_root = tmp_path_factory.mktemp("llms-build")
    sources = (
        ("conf.py", _BASE_CONF),
        ("index.md", _INDEX),
        ("quickstart.md", _QUICKSTART),
        ("advanced.md", _ADVANCED),
        ("api.md", _API),
    )
    scenario = SphinxScenario(
        files=tuple(ScenarioFile(name, text) for name, text in sources),
    )
    built = build_shared_sphinx_result(cache_root, scenario)
    outdir = built.outdir
    return LlmsBuildResult(
        result=built,
        llms_txt_path=outdir / "llms.txt",
        llms_full_path=outdir / "llms-full.txt",
        docs_json_path=outdir / "docs.json",
    )
llms_build.llms_txt_path.read_text(encoding="utf-8") + + +@pytest.fixture(scope="module") +def llms_full_content(llms_build: LlmsBuildResult) -> str: + """Read llms-full.txt content.""" + return llms_build.llms_full_path.read_text(encoding="utf-8") + + +@pytest.fixture(scope="module") +def docs_json_data(llms_build: LlmsBuildResult) -> dict[str, t.Any]: + """Parse docs.json content.""" + return json.loads( # type: ignore[no-any-return] + llms_build.docs_json_path.read_text(encoding="utf-8"), + ) diff --git a/tests/ext/llms/test_docs_json.py b/tests/ext/llms/test_docs_json.py new file mode 100644 index 00000000..5c0e55e2 --- /dev/null +++ b/tests/ext/llms/test_docs_json.py @@ -0,0 +1,89 @@ +"""Integration tests for docs.json generation.""" + +from __future__ import annotations + +import typing as t + +import pytest + +if t.TYPE_CHECKING: + from tests.ext.llms.conftest import LlmsBuildResult + +pytestmark = pytest.mark.integration + + +def test_docs_json_file_exists(llms_build: LlmsBuildResult) -> None: + """docs.json is written to the output directory.""" + assert llms_build.docs_json_path.exists() + + +def test_docs_json_name(docs_json_data: dict[str, t.Any]) -> None: + """docs.json name matches the project name.""" + assert docs_json_data["name"] == "llms-test" + + +def test_docs_json_url(docs_json_data: dict[str, t.Any]) -> None: + """docs.json url is the site URL without trailing slash.""" + assert docs_json_data["url"] == "https://example.org" + + +def test_docs_json_agent_entrypoints(docs_json_data: dict[str, t.Any]) -> None: + """docs.json agentEntrypoints has the expected keys.""" + ep = docs_json_data["agentEntrypoints"] + assert ep["manifest"] == "/docs.json" + assert ep["llms"] == "/llms.txt" + assert ep["llmsFull"] == "/llms-full.txt" + + +def test_docs_json_pages_count(docs_json_data: dict[str, t.Any]) -> None: + """docs.json pages array has an entry per document.""" + pages = docs_json_data["pages"] + page_urls = {p["url"] for p in pages} + assert 
"/quickstart.html" in page_urls + assert "/api.html" in page_urls + assert "/advanced.html" in page_urls + assert "/index.html" in page_urls + + +class PageFieldCase(t.NamedTuple): + """Test case for docs.json page field presence.""" + + test_id: str + field: str + + +_PAGE_FIELD_CASES: list[PageFieldCase] = [ + PageFieldCase(test_id="has-title", field="title"), + PageFieldCase(test_id="has-description", field="description"), + PageFieldCase(test_id="has-section", field="section"), + PageFieldCase(test_id="has-url", field="url"), + PageFieldCase(test_id="has-markdownUrl", field="markdownUrl"), + PageFieldCase(test_id="has-headings", field="headings"), +] + + +@pytest.mark.parametrize( + list(PageFieldCase._fields), + _PAGE_FIELD_CASES, + ids=[c.test_id for c in _PAGE_FIELD_CASES], +) +def test_docs_json_page_has_field( + test_id: str, + field: str, + docs_json_data: dict[str, t.Any], +) -> None: + """Every page in docs.json has the expected field.""" + for page in docs_json_data["pages"]: + assert field in page, f"page {page.get('url', '?')} missing '{field}'" + + +def test_docs_json_quickstart_headings( + docs_json_data: dict[str, t.Any], +) -> None: + """Quickstart page has extracted headings.""" + pages = docs_json_data["pages"] + qs = next(p for p in pages if p["url"] == "/quickstart.html") + heading_texts = [h["text"] for h in qs["headings"]] + assert "Quickstart" in heading_texts + assert "Installation" in heading_texts + assert "Usage" in heading_texts diff --git a/tests/ext/llms/test_importable.py b/tests/ext/llms/test_importable.py new file mode 100644 index 00000000..31b42f04 --- /dev/null +++ b/tests/ext/llms/test_importable.py @@ -0,0 +1,10 @@ +"""Smoke tests for sphinx_gp_llms importability and metadata.""" + +from __future__ import annotations + +from sphinx_gp_llms import setup + + +def test_setup_callable() -> None: + """setup() is callable and returns extension metadata.""" + assert callable(setup) diff --git a/tests/ext/llms/test_llms_full_txt.py 
b/tests/ext/llms/test_llms_full_txt.py new file mode 100644 index 00000000..c2d11582 --- /dev/null +++ b/tests/ext/llms/test_llms_full_txt.py @@ -0,0 +1,62 @@ +"""Integration tests for llms-full.txt generation.""" + +from __future__ import annotations + +import typing as t + +import pytest + +if t.TYPE_CHECKING: + from tests.ext.llms.conftest import LlmsBuildResult + +pytestmark = pytest.mark.integration + + +class FullTxtCase(t.NamedTuple): + """Test case for llms-full.txt content assertions.""" + + test_id: str + expected_substring: str + + +_CASES: list[FullTxtCase] = [ + FullTxtCase( + test_id="contains-quickstart-title", + expected_substring="# Quickstart", + ), + FullTxtCase( + test_id="contains-api-title", + expected_substring="# API Reference", + ), + FullTxtCase( + test_id="contains-source-url", + expected_substring="Source: https://example.org/", + ), + FullTxtCase( + test_id="contains-separator", + expected_substring="---", + ), + FullTxtCase( + test_id="contains-quickstart-body", + expected_substring="Get started with the project quickly.", + ), +] + + +@pytest.mark.parametrize( + list(FullTxtCase._fields), + _CASES, + ids=[c.test_id for c in _CASES], +) +def test_llms_full_txt_content( + test_id: str, + expected_substring: str, + llms_full_content: str, +) -> None: + """llms-full.txt contains the expected page content.""" + assert expected_substring in llms_full_content + + +def test_llms_full_txt_file_exists(llms_build: LlmsBuildResult) -> None: + """llms-full.txt is written to the output directory.""" + assert llms_build.llms_full_path.exists() diff --git a/tests/ext/llms/test_llms_txt.py b/tests/ext/llms/test_llms_txt.py new file mode 100644 index 00000000..d717349e --- /dev/null +++ b/tests/ext/llms/test_llms_txt.py @@ -0,0 +1,66 @@ +"""Integration tests for llms.txt generation.""" + +from __future__ import annotations + +import typing as t + +import pytest + +if t.TYPE_CHECKING: + from tests.ext.llms.conftest import LlmsBuildResult + +pytestmark = 
pytest.mark.integration + + +class LlmsTxtCase(t.NamedTuple): + """Test case for llms.txt content assertions.""" + + test_id: str + expected_substring: str + + +_CASES: list[LlmsTxtCase] = [ + LlmsTxtCase( + test_id="h1-is-project-name", + expected_substring="# llms-test", + ), + LlmsTxtCase( + test_id="blockquote-summary", + expected_substring="> A test project for LLM documentation outputs.", + ), + LlmsTxtCase( + test_id="guide-section-heading", + expected_substring="## Guide", + ), + LlmsTxtCase( + test_id="reference-section-heading", + expected_substring="## Reference", + ), + LlmsTxtCase( + test_id="quickstart-link", + expected_substring="[Quickstart](https://example.org/quickstart.html)", + ), + LlmsTxtCase( + test_id="api-link", + expected_substring="[API Reference](https://example.org/api.html)", + ), +] + + +@pytest.mark.parametrize( + list(LlmsTxtCase._fields), + _CASES, + ids=[c.test_id for c in _CASES], +) +def test_llms_txt_content( + test_id: str, + expected_substring: str, + llms_txt_content: str, +) -> None: + """llms.txt contains the expected structure.""" + assert expected_substring in llms_txt_content + + +def test_llms_txt_file_exists(llms_build: LlmsBuildResult) -> None: + """llms.txt is written to the output directory.""" + assert llms_build.llms_txt_path.exists() diff --git a/tests/ext/llms/test_md_twins.py b/tests/ext/llms/test_md_twins.py new file mode 100644 index 00000000..8a1689ba --- /dev/null +++ b/tests/ext/llms/test_md_twins.py @@ -0,0 +1,52 @@ +"""Integration tests for per-page .md twin file generation.""" + +from __future__ import annotations + +import typing as t + +import pytest + +if t.TYPE_CHECKING: + from tests.ext.llms.conftest import LlmsBuildResult + +pytestmark = pytest.mark.integration + + +class MdTwinCase(t.NamedTuple): + """Test case for .md twin file existence.""" + + test_id: str + docname: str + + +_CASES: list[MdTwinCase] = [ + MdTwinCase(test_id="index", docname="index"), + MdTwinCase(test_id="quickstart", 
docname="quickstart"), + MdTwinCase(test_id="advanced", docname="advanced"), + MdTwinCase(test_id="api", docname="api"), +] + + +@pytest.mark.parametrize( + list(MdTwinCase._fields), + _CASES, + ids=[c.test_id for c in _CASES], +) +def test_md_twin_exists( + test_id: str, + docname: str, + llms_build: LlmsBuildResult, +) -> None: + """A .md twin file exists alongside each HTML page.""" + md_path = llms_build.result.outdir / f"{docname}.md" + assert md_path.exists(), f"{docname}.md not found in build output" + + +def test_md_twin_content_matches_source( + llms_build: LlmsBuildResult, +) -> None: + """The .md twin content matches the original source file.""" + md_content = (llms_build.result.outdir / "quickstart.md").read_text( + encoding="utf-8", + ) + assert "Get started with the project quickly." in md_content diff --git a/tests/test_package_reference.py b/tests/test_package_reference.py index 8a96c8d6..622cb9fc 100644 --- a/tests/test_package_reference.py +++ b/tests/test_package_reference.py @@ -22,6 +22,7 @@ def test_workspace_packages_lists_publishable_packages() -> None: "gp-furo-theme", "sphinx-gp-opengraph", "sphinx-gp-sitemap", + "sphinx-gp-llms", "gp-sphinx", "sphinx-autodoc-argparse", "sphinx-autodoc-api-style", diff --git a/uv.lock b/uv.lock index 26f640b6..e95cfb6a 100644 --- a/uv.lock +++ b/uv.lock @@ -24,6 +24,7 @@ members = [ "sphinx-autodoc-sphinx", "sphinx-autodoc-typehints-gp", "sphinx-fonts", + "sphinx-gp-llms", "sphinx-gp-opengraph", "sphinx-gp-sitemap", "sphinx-gp-theme", @@ -543,6 +544,7 @@ dev = [ { name = "sphinx-autodoc-fastmcp" }, { name = "sphinx-autodoc-pytest-fixtures" }, { name = "sphinx-autodoc-sphinx" }, + { name = "sphinx-gp-llms" }, { name = "sphinx-gp-opengraph" }, { name = "sphinx-gp-sitemap" }, { name = "sphinx-ux-autodoc-layout" }, @@ -582,6 +584,7 @@ dev = [ { name = "sphinx-autodoc-fastmcp", editable = "packages/sphinx-autodoc-fastmcp" }, { name = "sphinx-autodoc-pytest-fixtures", editable = 
"packages/sphinx-autodoc-pytest-fixtures" }, { name = "sphinx-autodoc-sphinx", editable = "packages/sphinx-autodoc-sphinx" }, + { name = "sphinx-gp-llms", editable = "packages/sphinx-gp-llms" }, { name = "sphinx-gp-opengraph", editable = "packages/sphinx-gp-opengraph" }, { name = "sphinx-gp-sitemap", editable = "packages/sphinx-gp-sitemap" }, { name = "sphinx-ux-autodoc-layout", editable = "packages/sphinx-ux-autodoc-layout" }, @@ -1823,6 +1826,18 @@ dependencies = [ [package.metadata] requires-dist = [{ name = "sphinx", specifier = ">=8.1" }] +[[package]] +name = "sphinx-gp-llms" +version = "0.0.1a23" +source = { editable = "packages/sphinx-gp-llms" } +dependencies = [ + { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "sphinx", version = "8.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, +] + +[package.metadata] +requires-dist = [{ name = "sphinx", specifier = ">=8.1" }] + [[package]] name = "sphinx-gp-opengraph" version = "0.0.1a23"