From 838aaf84db8691cd9f442d2fa2c8f9a6f024197b Mon Sep 17 00:00:00 2001 From: mscherer Date: Mon, 9 Feb 2026 10:44:27 +0100 Subject: [PATCH] Fix emphasis delimiters inside link destinations (destination-only) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simpler alternative to PR #80 that only protects delimiters inside link destinations `](...)`, without the full link lookahead. Problem: Emphasis delimiters (`_`, `*`) inside link destinations were matched as closers, breaking link formation: `_[link](http://example.com?foo_bar=1), more text_` → `[link](http://example.com?foobar=1), more text_` Fix: When scanning for emphasis closers in `parseDelimited()`, skip over link destinations by detecting `](` and finding the matching `)`. This prevents delimiters inside URLs from closing emphasis. This is a targeted fix for the common real-world issue (URLs with underscores in query parameters). It intentionally does NOT fix the bracket-text case (`_[foo_](url)`) which remains emphasis - this keeps the implementation simple per discussion in jgm/djot#375. Refs: jgm/djot#375 --- src/Parser/InlineParser.php | 54 +++++ tests/TestCase/Parser/InlineParserTest.php | 245 +++++++++++++++++++++ tests/official/links_and_images.test | 8 +- 3 files changed, 303 insertions(+), 4 deletions(-) diff --git a/src/Parser/InlineParser.php b/src/Parser/InlineParser.php index e72b4d3..8a9a5d8 100644 --- a/src/Parser/InlineParser.php +++ b/src/Parser/InlineParser.php @@ -1047,6 +1047,18 @@ protected function parseDelimited(string $text, int $pos, string $delimiter, str } } + // Skip over link destinations ](...) + // This prevents emphasis delimiters inside URLs from closing emphasis + // that started before the link. e.g. _[link](url_bar)_ should work. + if ($char === ']' && $searchPos + 1 < $length && $text[$searchPos + 1] === '(') { + $destEnd = $this->findLinkDestinationEnd($text, $searchPos + 1); + if ($destEnd !== null) { + $searchPos = $destEnd; + + continue; + } + } + // Skip escape sequences if ($char === '\\' && $searchPos + 1 < $length) { $searchPos += 2; @@ -1618,6 +1630,48 @@ protected function findCodeSpanEnd(string $text, int $pos): ?int return null; } + /** + * Find the end of a link destination starting at $pos (which points to '('). + * + * This is a simpler version that only handles the destination part, + * not the full link syntax. Used to skip over URL content when scanning + * for emphasis closers. + * + * @return int|null Position after the closing ), or null if not found + */ + protected function findLinkDestinationEnd(string $text, int $pos): ?int + { + $length = strlen($text); + if ($pos >= $length || $text[$pos] !== '(') { + return null; + } + + $parenDepth = 1; + $i = $pos + 1; + + while ($i < $length && $parenDepth > 0) { + $char = $text[$i]; + if ($char === '(') { + $parenDepth++; + } elseif ($char === ')') { + $parenDepth--; + } elseif ($char === '\\' && $i + 1 < $length) { + // Skip escaped character + $i++; + } + if ($parenDepth > 0) { + $i++; + } + } + + if ($parenDepth !== 0) { + return null; + } + + // Return position after the closing ) + return $i + 1; + } + /** * Find the end of an autolink starting at $pos * diff --git a/tests/TestCase/Parser/InlineParserTest.php b/tests/TestCase/Parser/InlineParserTest.php index f274819..99a7d34 100644 --- a/tests/TestCase/Parser/InlineParserTest.php +++ b/tests/TestCase/Parser/InlineParserTest.php @@ -377,6 +377,251 @@ public function testAutolinkPrecedenceOverEmphasis(): void $this->assertSame('http://example.com/a_b', $children[1]->getDestination()); } + /** + * Test: Underscore in link URL should not break emphasis + * + * This is the main issue from https://github.com/jgm/djot/issues/375 + * _[link](http://example.com?foo_bar=1), more text_ + * Should produce emphasis around the entire content, with a working link. + */ + public function testEmphasisWithUnderscoreInLinkDestination(): void + { + $para = $this->parseInline('_[link](http://example.com?foo_bar=1), more text_'); + + $children = $para->getChildren(); + $this->assertCount(1, $children); + $this->assertInstanceOf(Emphasis::class, $children[0]); + + $emChildren = $children[0]->getChildren(); + // Should contain a link node followed by text + $this->assertInstanceOf(Link::class, $emChildren[0]); + $this->assertSame('http://example.com?foo_bar=1', $emChildren[0]->getDestination()); + + // Verify link text + $linkChildren = $emChildren[0]->getChildren(); + $this->assertInstanceOf(Text::class, $linkChildren[0]); + $this->assertSame('link', $linkChildren[0]->getContent()); + } + + /** + * Test: Simple underscore in path should not break emphasis + */ + public function testEmphasisWithUnderscoreInSimplePath(): void + { + $para = $this->parseInline('_hello [link](a_b) world_'); + + $children = $para->getChildren(); + $this->assertCount(1, $children); + $this->assertInstanceOf(Emphasis::class, $children[0]); + + $emChildren = $children[0]->getChildren(); + $this->assertInstanceOf(Text::class, $emChildren[0]); + $this->assertSame('hello ', $emChildren[0]->getContent()); + $this->assertInstanceOf(Link::class, $emChildren[1]); + $this->assertSame('a_b', $emChildren[1]->getDestination()); + $this->assertInstanceOf(Text::class, $emChildren[2]); + $this->assertSame(' world', $emChildren[2]->getContent()); + } + + /** + * Test: Star in link URL should not break strong + */ + public function testStrongWithStarInLinkDestination(): void + { + $para = $this->parseInline('*[link](http://example.com?q=a*b) text*'); + + $children = $para->getChildren(); + $this->assertCount(1, $children); + $this->assertInstanceOf(Strong::class, $children[0]); + + $strongChildren = $children[0]->getChildren(); + $this->assertInstanceOf(Link::class, $strongChildren[0]); + $this->assertSame('http://example.com?q=a*b', $strongChildren[0]->getDestination()); + } + + /** + * Test: Multiple underscores in URL + */ + public function testEmphasisWithMultipleUnderscoresInDestination(): void + { + $para = $this->parseInline('_[link](path/to_file_name_here)_'); + + $children = $para->getChildren(); + $this->assertCount(1, $children); + $this->assertInstanceOf(Emphasis::class, $children[0]); + + $emChildren = $children[0]->getChildren(); + $this->assertInstanceOf(Link::class, $emChildren[0]); + $this->assertSame('path/to_file_name_here', $emChildren[0]->getDestination()); + } + + /** + * Test: Nested parentheses in URL should be handled correctly + */ + public function testEmphasisWithNestedParensInDestination(): void + { + $para = $this->parseInline('_[wiki](http://en.wikipedia.org/wiki/Foo_(bar))_'); + + $children = $para->getChildren(); + $this->assertCount(1, $children); + $this->assertInstanceOf(Emphasis::class, $children[0]); + + $emChildren = $children[0]->getChildren(); + $this->assertInstanceOf(Link::class, $emChildren[0]); + $this->assertSame('http://en.wikipedia.org/wiki/Foo_(bar)', $emChildren[0]->getDestination()); + } + + /** + * Test: Escaped underscore in URL + */ + public function testEmphasisWithEscapedUnderscoreInDestination(): void + { + $para = $this->parseInline('_[link](path/to\_file)_'); + + $children = $para->getChildren(); + $this->assertCount(1, $children); + $this->assertInstanceOf(Emphasis::class, $children[0]); + + $emChildren = $children[0]->getChildren(); + $this->assertInstanceOf(Link::class, $emChildren[0]); + // Escape is processed, so _ becomes literal + $this->assertSame('path/to_file', $emChildren[0]->getDestination()); + } + + /** + * Test: Image with underscore in URL should also work + */ + public function testEmphasisWithUnderscoreInImageDestination(): void + { + $para = $this->parseInline('_![alt](image_file.png)_'); + + $children = $para->getChildren(); + $this->assertCount(1, $children); + $this->assertInstanceOf(Emphasis::class, $children[0]); + + $emChildren = $children[0]->getChildren(); + $this->assertInstanceOf(Image::class, $emChildren[0]); + $this->assertSame('image_file.png', $emChildren[0]->getSource()); + } + + /** + * Test: Multiple links with underscores in URLs + */ + public function testEmphasisWithMultipleLinksWithUnderscores(): void + { + $para = $this->parseInline('_[a](x_y) and [b](p_q)_'); + + $children = $para->getChildren(); + $this->assertCount(1, $children); + $this->assertInstanceOf(Emphasis::class, $children[0]); + + $emChildren = $children[0]->getChildren(); + $this->assertInstanceOf(Link::class, $emChildren[0]); + $this->assertSame('x_y', $emChildren[0]->getDestination()); + $this->assertInstanceOf(Text::class, $emChildren[1]); + $this->assertSame(' and ', $emChildren[1]->getContent()); + $this->assertInstanceOf(Link::class, $emChildren[2]); + $this->assertSame('p_q', $emChildren[2]->getDestination()); + } + + /** + * Test: Underscore in link text still triggers emphasis (bracket-text case) + * + * This is intentionally NOT fixed by the destination-only approach. + * _[foo_](url) should still produce emphasis [foo then ](url) as text. + */ + public function testUnderscoreInLinkTextStillTriggersEmphasis(): void + { + $para = $this->parseInline('_[bar_](url)'); + + $children = $para->getChildren(); + // The _ inside [bar_] closes the emphasis started at the beginning + // Result: [bar](url) + $this->assertInstanceOf(Emphasis::class, $children[0]); + + $emChildren = $children[0]->getChildren(); + $this->assertInstanceOf(Text::class, $emChildren[0]); + $this->assertSame('[bar', $emChildren[0]->getContent()); + + // Remaining text after emphasis + $this->assertInstanceOf(Text::class, $children[1]); + $this->assertSame('](url)', $children[1]->getContent()); + } + + /** + * Test: Unclosed link destination doesn't break emphasis + */ + public function testEmphasisWithUnclosedLinkDestination(): void + { + // [foo](_bar is not a complete link, so emphasis should still work + $para = $this->parseInline('_text [foo](_bar more_'); + + $children = $para->getChildren(); + // This is complex - the [foo]( triggers unclosed link handling + // The emphasis should close on the last _ + $this->assertInstanceOf(Emphasis::class, $children[0]); + } + + /** + * Test: Link destination without preceding bracket should not affect emphasis + */ + public function testEmphasisWithParensNotPrecededByBracket(): void + { + // Just (a_b) without [] before it - underscore should close emphasis + $para = $this->parseInline('_foo (a_b) bar_'); + + $children = $para->getChildren(); + // The _ in (a_b) closes emphasis because there's no ]( pattern + $this->assertInstanceOf(Emphasis::class, $children[0]); + $emChildren = $children[0]->getChildren(); + $this->assertInstanceOf(Text::class, $emChildren[0]); + $this->assertSame('foo (a', $emChildren[0]->getContent()); + } + + /** + * Test: Complex real-world URL with query params and underscores + */ + public function testEmphasisWithComplexQueryString(): void + { + $para = $this->parseInline('_Check [this API](https://api.example.com/v1/users?sort_by=name&filter_type=active) for details_'); + + $children = $para->getChildren(); + $this->assertCount(1, $children); + $this->assertInstanceOf(Emphasis::class, $children[0]); + + $emChildren = $children[0]->getChildren(); + $found = false; + foreach ($emChildren as $child) { + if ($child instanceof Link) { + $this->assertSame('https://api.example.com/v1/users?sort_by=name&filter_type=active', $child->getDestination()); + $found = true; + } + } + $this->assertTrue($found, 'Link should be found within emphasis'); + } + + /** + * Test: Strong (star) with complex URL + */ + public function testStrongWithComplexUrl(): void + { + $para = $this->parseInline('*Visit [the page](http://example.com/path*with*stars) now*'); + + $children = $para->getChildren(); + $this->assertCount(1, $children); + $this->assertInstanceOf(Strong::class, $children[0]); + + $strongChildren = $children[0]->getChildren(); + $foundLink = false; + foreach ($strongChildren as $child) { + if ($child instanceof Link) { + $this->assertSame('http://example.com/path*with*stars', $child->getDestination()); + $foundLink = true; + } + } + $this->assertTrue($foundLink, 'Link should be found within strong'); + } + public function testEmphasisFollowedByCloseBrace(): void { // Emphasis opener cannot be followed by } (closer marker) diff --git a/tests/official/links_and_images.test b/tests/official/links_and_images.test index f14a58f..d92ef56 100644 --- a/tests/official/links_and_images.test +++ b/tests/official/links_and_images.test @@ -174,15 +174,15 @@ b*)

closed

``` -Here the strong takes precedence over the link because it -starts first: +Here the link takes precedence because the star inside the +destination is protected from closing emphasis: ``` *[closed](hello*) . -

[closed](hello)

+

*closed

``` -Avoid this with a backslash escape: +This also works with a backslash escape: ``` *[closed](hello\*) .