Skip to content

Commit 09b3057

Browse files
heiskr and Copilot authored
Replace Cheerio in mini-TOC with AST-based rehype plugin (#60548)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 4c0b0b5 commit 09b3057

File tree

6 files changed

+308
-129
lines changed

6 files changed

+308
-129
lines changed
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import { describe, expect, test } from 'vitest'
2+
3+
import { renderContent } from '@/content-render/index'
4+
import type { CollectedHeading } from '@/frame/lib/get-mini-toc-items'
5+
import type { Context } from '@/types'
6+
7+
// Integration tests for the collect-mini-toc rehype plugin.
// Each test renders Markdown through `renderContent` with a `collectMiniToc`
// array on the context and then inspects what ended up in that array.
describe('collect-mini-toc rehype plugin', () => {
  test('collects h2 and h3 headings with href and title', async () => {
    const collectMiniToc: CollectedHeading[] = []
    await renderContent('## Section one\n\n### Subsection\n\n## Section two', {
      collectMiniToc,
    } as Context)

    expect(collectMiniToc).toHaveLength(3)
    // Every collected heading is checked for href, title and level, so a
    // regression in href extraction on later headings is caught too.
    expect(collectMiniToc[0]).toMatchObject({
      title: 'Section one',
      href: '#section-one',
      headingLevel: 2,
    })
    expect(collectMiniToc[1]).toMatchObject({
      title: 'Subsection',
      href: '#subsection',
      headingLevel: 3,
    })
    expect(collectMiniToc[2]).toMatchObject({
      title: 'Section two',
      href: '#section-two',
      headingLevel: 2,
    })
  })

  test('skips headings inside hidden ancestors', async () => {
    const collectMiniToc: CollectedHeading[] = []
    const md = [
      '## Visible heading',
      '',
      '<div hidden>',
      '',
      '## Hidden heading',
      '',
      '</div>',
      '',
      '## Another visible',
    ].join('\n')
    await renderContent(md, { collectMiniToc } as Context)

    const titles = collectMiniToc.map((h) => h.title)
    expect(titles).toContain('Visible heading')
    expect(titles).toContain('Another visible')
    expect(titles).not.toContain('Hidden heading')
  })

  test('detects ghd-tool platform class', async () => {
    const collectMiniToc: CollectedHeading[] = []
    const md = ['<div class="ghd-tool mac">', '', '## Mac instructions', '', '</div>'].join('\n')
    await renderContent(md, { collectMiniToc } as Context)

    expect(collectMiniToc).toHaveLength(1)
    // The platform is the wrapper's full class string, not just the tool name.
    expect(collectMiniToc[0].platform).toBe('ghd-tool mac')
  })

  test('returns empty platform when no ghd-tool wrapper', async () => {
    const collectMiniToc: CollectedHeading[] = []
    await renderContent('## Plain heading', { collectMiniToc } as Context)

    expect(collectMiniToc).toHaveLength(1)
    expect(collectMiniToc[0].platform).toBe('')
  })

  test('does not collect when collectMiniToc is not provided', async () => {
    // Should not throw — without a collection array the plugin is a no-op
    const result = await renderContent('## Heading')
    expect(result).toContain('Heading')
  })
})
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
import { visitParents } from 'unist-util-visit-parents'
2+
import { toString } from 'hast-util-to-string'
3+
import type { Plugin } from 'unified'
4+
import type { Root, Element, ElementContent } from 'hast'
5+
import type { CollectedHeading } from '@/frame/lib/get-mini-toc-items'
6+
7+
/** Options accepted by the collect-mini-toc rehype plugin. */
interface CollectMiniTocOptions {
  /**
   * Array that receives one entry per collected heading.
   * When omitted, the plugin does nothing.
   */
  collectInto?: CollectedHeading[]
}
10+
11+
/**
 * Reports whether `name` is one of the element's class tokens.
 *
 * `properties.className` is accepted either as an array of tokens or as a
 * single whitespace-separated string; any other value counts as "no classes".
 *
 * @param el - Element whose `properties.className` is inspected.
 * @param name - Exact class token to look for (no partial matches).
 * @returns `true` when the token is present, otherwise `false`.
 */
function hasClassName(el: Element, name: string): boolean {
  const className = el.properties?.className
  if (typeof className === 'string') {
    return className.split(/\s+/).some((token) => token === name)
  }
  return Array.isArray(className) ? className.some((token) => token === name) : false
}
17+
18+
/**
 * Returns the element's classes as a single string.
 *
 * An array `className` is joined with single spaces, a string `className` is
 * returned unchanged, and anything else yields the empty string.
 *
 * @param el - Element whose `properties.className` is read.
 * @returns The class string, or `''` when the element has no usable className.
 */
function getClassString(el: Element): string {
  const className = el.properties?.className
  if (typeof className === 'string') return className
  return Array.isArray(className) ? className.join(' ') : ''
}
24+
25+
// Matches the tag names h1 through h6. Built once instead of once per visited node.
const HEADING_TAG_NAME = /^h[1-6]$/

/**
 * Reports whether an element carries a `hidden` attribute in any form.
 *
 * The Cheerio-based implementation this plugin replaced skipped a heading
 * whenever the attribute was merely present, so every value except an absent
 * or explicitly `false` one is treated as hidden here — not only the boolean
 * `true`. NOTE(review): non-boolean values such as `until-found` presumably
 * arrive as strings in the tree; confirm against the hast schema in use.
 */
function isHidden(el: Element): boolean {
  const hidden = el.properties?.hidden
  return hidden !== undefined && hidden !== null && hidden !== false
}

/**
 * Rehype plugin that collects heading data (href, title, level, platform)
 * during rendering, so callers don't need to re-parse the HTML.
 * Place this after heading-links in the processor chain.
 *
 * A heading is collected only when all of the following hold:
 * - its tag is h1-h6 and it has an `id`;
 * - no ancestor element is hidden;
 * - it has a direct `<a class="heading-link">` child with a non-empty href.
 *
 * @param options - Plugin options. When `collectInto` is missing (or the
 *   plugin is registered without options) no transformer is installed.
 * @returns A transformer that appends one entry per qualifying heading to
 *   `collectInto`, in document order, or nothing when collection is off.
 */
const collectMiniToc: Plugin<[CollectMiniTocOptions?], Root> = ({ collectInto } = {}) => {
  if (!collectInto) return
  const sink: CollectedHeading[] = collectInto

  return (tree: Root) => {
    visitParents(tree, 'element', (node, ancestors) => {
      const el = node as Element
      if (!HEADING_TAG_NAME.test(el.tagName)) return
      if (!el.properties?.id) return

      // Only element ancestors can carry attributes; the root is dropped here.
      const elementAncestors = ancestors.filter(
        (anc): anc is Element => anc.type === 'element',
      )

      // Skip headings inside hidden ancestors
      if (elementAncestors.some(isHidden)) return

      // The `2` from `h2`
      const headingLevel = parseInt(el.tagName.charAt(1), 10)

      // Find the anchor child that heading-links.ts created
      const anchor = el.children.find(
        (child): child is Element =>
          child.type === 'element' && child.tagName === 'a' && hasClassName(child, 'heading-link'),
      )
      if (!anchor) return

      // Checked at runtime rather than cast, so a non-string href is never collected.
      const href = anchor.properties?.href
      if (typeof href !== 'string' || !href) return

      // Filter out direct-child <span> elements (and their content).
      // heading-links.ts always inserts the heading-link-symbol span as a
      // direct child of the anchor, so filtering direct children is sufficient.
      const title = (anchor.children || [])
        .filter(
          (child: ElementContent) =>
            !(child.type === 'element' && (child as Element).tagName === 'span'),
        )
        .map((child) => toString(child))
        .join('')
        .trim()

      // Detect platform from the first ghd-tool ancestor (ancestors are
      // ordered from the root down, so this is the outermost wrapper).
      const toolWrapper = elementAncestors.find((anc) => hasClassName(anc, 'ghd-tool'))
      const platform = toolWrapper ? getClassString(toolWrapper) : ''

      sink.push({ href, title, headingLevel, platform })
    })
  }
}
80+
81+
export default collectMiniToc

src/content-render/unified/processor.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import rewriteImgSources from './rewrite-asset-urls'
2020
import rewriteAssetImgTags from './rewrite-asset-img-tags'
2121
import useEnglishHeadings from './use-english-headings'
2222
import headingLinks from './heading-links'
23+
import collectMiniToc from './collect-mini-toc'
2324
import rewriteTheadThScope from './rewrite-thead-th-scope'
2425
import rewriteEmptyTableRows from './rewrite-empty-table-rows'
2526
import rewriteForRowheaders from './rewrite-for-rowheaders'
@@ -31,6 +32,7 @@ import alerts from './alerts'
3132
import removeHtmlComments from 'remark-remove-comments'
3233
import remarkStringify from 'remark-stringify'
3334
import type { Context, UnifiedProcessor } from '@/content-render/types'
35+
import type { CollectedHeading } from '@/frame/lib/get-mini-toc-items'
3436

3537
export function createProcessor(context: Context): UnifiedProcessor {
3638
return (
@@ -74,6 +76,9 @@ export function createProcessor(context: Context): UnifiedProcessor {
7476
},
7577
})
7678
.use(raw)
79+
.use(collectMiniToc, {
80+
collectInto: context.collectMiniToc as CollectedHeading[] | undefined,
81+
})
7782
.use(wrapProceduralImages)
7883
.use(rewriteEmptyTableRows)
7984
.use(rewriteTheadThScope)

src/frame/lib/get-mini-toc-items.ts

Lines changed: 37 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1-
import { load } from 'cheerio'
2-
import type { Element } from 'domhandler'
3-
import { range } from 'lodash-es'
4-
51
import { renderContent } from '@/content-render/index'
62
import type { Context } from '@/types'
73

4+
/**
 * One heading gathered while content is rendered; the raw material from
 * which mini-TOC items are built.
 */
export interface CollectedHeading {
  /** Link target of the heading, e.g. `#section-one`. */
  href: string
  /** Text of the heading as shown in the mini-TOC. */
  title: string
  /** Numeric heading level, e.g. `2` for an `h2`. */
  headingLevel: number
  /** Platform/tool class string for the heading, or `''` when it has none. */
  platform: string
}
10+
811
interface MiniTocContents {
912
href: string
1013
title: string
@@ -24,81 +27,38 @@ interface FlatTocItem {
2427
items?: FlatTocItem[]
2528
}
2629

27-
// Keep maxHeadingLevel=2 for accessibility reasons, see docs-engineering#2701 for more info
28-
export default function getMiniTocItems(
29-
html: string,
30+
// Build MiniTocItems from pre-collected heading data (from the collect-mini-toc
31+
// rehype plugin). This is the only path for generating mini-TOC items — headings
32+
// are collected directly from the AST during rendering, avoiding any HTML
33+
// re-parsing.
34+
// Keep maxHeadingLevel=2 for accessibility reasons, see docs-engineering#2701
35+
export function buildMiniTocFromCollected(
36+
collected: CollectedHeading[],
3037
maxHeadingLevel = 2,
31-
headingScope = '',
3238
): MiniTocItem[] {
33-
const $ = load(html, { xmlMode: true })
34-
35-
// eg `h2, h3` or `h2, h3, h4` depending on maxHeadingLevel
36-
const selector = range(2, maxHeadingLevel + 1)
37-
.map((num) => `${headingScope} h${num}`)
38-
.join(', ')
39-
const headings = $(selector)
40-
41-
// return an array of objects containing each heading's contents, level, and optional platform.
42-
// Article layout uses these as follows:
43-
// - `title` and `link` to render the mini TOC headings
44-
// - `headingLevel` the `2` in `h2`; used for determining required indentation
45-
// - `platform` to show or hide platform-specific headings via client JS
39+
const effectiveMax = maxHeadingLevel > 0 ? maxHeadingLevel : 2
40+
const headings = collected.filter((h) => h.headingLevel >= 2 && h.headingLevel <= effectiveMax)
4641

47-
// H1 = highest importance, H6 = lowest importance
4842
let mostImportantHeadingLevel: number | undefined
49-
const flatToc = headings
50-
.get()
51-
.filter((item) => {
52-
const parent = item.parent as Element | null
53-
if (!parent || !parent.attribs) return true
54-
const { attribs } = parent
55-
return !('hidden' in attribs)
56-
})
57-
.map((item) => {
58-
// remove any <span> tags including their content
59-
$('span', item).remove()
60-
61-
// Capture the anchor tag nested within the header, get its href and remove it
62-
const anchor = $('a.heading-link', item)
63-
const href = anchor.attr('href')
64-
if (!href) {
65-
// Can happen if the, for example, `<h2>` tag was put there
66-
// manually with HTML into the Markdown content. Then it wouldn't
67-
// be rendered with an expected `<a class="heading-link" href="#..."`
68-
// link in front of it.
69-
// The `return null` will be filtered after the `.map()`
70-
return null
71-
}
72-
73-
// remove any <strong> tags but leave content
74-
$('strong', item).map((i, el) => $(el).replaceWith($(el).contents()))
7543

76-
const contents: MiniTocContents = { href, title: $(item).text().trim() }
77-
const element = $(item)[0] as Element
78-
const headingLevel = parseInt(element.name.match(/\d+/)![0], 10) || 0 // the `2` from `h2`
79-
80-
const platform = $(item).parent('.ghd-tool').attr('class') || ''
81-
82-
// track the most important heading level while we're looping through the items
83-
if (headingLevel < mostImportantHeadingLevel! || mostImportantHeadingLevel === undefined) {
84-
mostImportantHeadingLevel = headingLevel
85-
}
44+
const flatToc: FlatTocItem[] = headings.map((h) => {
45+
if (mostImportantHeadingLevel === undefined || h.headingLevel < mostImportantHeadingLevel) {
46+
mostImportantHeadingLevel = h.headingLevel
47+
}
48+
return {
49+
contents: { href: h.href, title: h.title },
50+
headingLevel: h.headingLevel,
51+
platform: h.platform,
52+
indentationLevel: 0,
53+
}
54+
})
8655

87-
return { contents, headingLevel, platform }
88-
})
89-
.filter(Boolean)
90-
.map((item) => {
91-
// set the indentation level for each item based on the most important
92-
// heading level in the current article
93-
return {
94-
...item!,
95-
indentationLevel: item!.headingLevel - mostImportantHeadingLevel!,
96-
}
97-
})
56+
// Set indentation relative to the most important heading
57+
for (const item of flatToc) {
58+
item.indentationLevel = item.headingLevel - (mostImportantHeadingLevel ?? item.headingLevel)
59+
}
9860

99-
// convert the flatToc to a nested structure to simplify semantic rendering on the client
10061
const nestedToc = buildNestedToc(flatToc)
101-
10262
return minimalMiniToc(nestedToc)
10363
}
10464

@@ -179,6 +139,10 @@ export async function getAutomatedPageMiniTocItems(
179139
})
180140
.join('')
181141

182-
const toc = await renderContent(titles, context)
183-
return getMiniTocItems(toc, depth, '')
142+
// Collect headings during render via the rehype plugin
143+
const collectMiniToc: CollectedHeading[] = []
144+
const renderContext = { ...context, collectMiniToc }
145+
await renderContent(titles, renderContext)
146+
147+
return buildMiniTocFromCollected(collectMiniToc, depth)
184148
}

src/frame/middleware/render-page.ts

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import type { Response } from 'express'
33
import type { Failbot } from '@github/failbot'
44
import { get } from 'lodash-es'
55

6-
import getMiniTocItems from '@/frame/lib/get-mini-toc-items'
6+
import { buildMiniTocFromCollected, type CollectedHeading } from '@/frame/lib/get-mini-toc-items'
77
import patterns from '@/frame/lib/patterns'
88
import FailBot from '@/observability/lib/failbot'
99
import statsd from '@/observability/lib/statsd'
@@ -24,6 +24,13 @@ async function buildRenderedPage(req: ExtendedRequest): Promise<string> {
2424
if (!page) throw new Error('page not set in context')
2525
const path = req.pagePath || req.path
2626

27+
// Set up collection array for the collect-mini-toc rehype plugin only when
28+
// the page actually needs a mini-TOC, avoiding unnecessary work.
29+
if (page.showMiniToc) {
30+
const collectMiniToc: CollectedHeading[] = []
31+
context.collectMiniToc = collectMiniToc
32+
}
33+
2734
const pageRenderTimed = statsd.asyncTimer(page.render, STATSD_KEY_RENDER, [`path:${path}`])
2835

2936
return (await pageRenderTimed(context)) as string
@@ -39,7 +46,11 @@ function buildMiniTocItems(req: ExtendedRequest) {
3946
return
4047
}
4148

42-
return getMiniTocItems(context.renderedPage || '', 0)
49+
// Use headings collected during rendering via the collect-mini-toc rehype plugin.
50+
const collected = context.collectMiniToc as CollectedHeading[] | undefined
51+
if (collected) {
52+
return buildMiniTocFromCollected(collected, 2)
53+
}
4354
}
4455

4556
export default async function renderPage(req: ExtendedRequest, res: Response) {

0 commit comments

Comments
 (0)