From 7c3c644746daaad546a89079cc503ff475ebd6e6 Mon Sep 17 00:00:00 2001 From: Will Fuqua Date: Sun, 7 Jun 2026 14:09:22 +0700 Subject: [PATCH] Handle spacing grapheme extenders U+FF9E-U+FF9F --- src/PrettyPrompt/Rendering/UnicodeWidth.cs | 40 ++++++++++++++----- tests/PrettyPrompt.Tests/GraphemeTests.cs | 20 ++++++++++ tests/PrettyPrompt.Tests/ScreenTests.cs | 6 +++ tests/PrettyPrompt.Tests/UnicodeWidthTests.cs | 16 ++++++++ 4 files changed, 71 insertions(+), 11 deletions(-) diff --git a/src/PrettyPrompt/Rendering/UnicodeWidth.cs b/src/PrettyPrompt/Rendering/UnicodeWidth.cs index 75f3e26..d122e62 100644 --- a/src/PrettyPrompt/Rendering/UnicodeWidth.cs +++ b/src/PrettyPrompt/Rendering/UnicodeWidth.cs @@ -15,17 +15,21 @@ namespace PrettyPrompt.Rendering; /// /// Calculates how many terminal columns ("cells") a character or string occupies. /// -/// /// Per-scalar widths come from — a vendored, source-only copy of -/// https://github.com/spectreconsole/wcwidth (Unicode 16). PrettyPrompt layers grapheme-cluster -/// awareness on top: a cluster (an emoji ZWJ sequence such as "🤦🏼‍♂️", or a base character followed -/// by combining marks / a variation selector) occupies the width of its base scalar value — the -/// trailing scalars modify the glyph but add no columns. Each cluster's width is capped at 2, because -/// the renderer models every cell as one or two columns wide (see ); without the cap, -/// summing the parts of a cluster (e.g. base + skin-tone modifier = 2 + 2) produced widths of 3-5 and -/// crashed cursor positioning. See https://github.com/waf/PrettyPrompt/issues/270. -/// +/// https://github.com/spectreconsole/wcwidth (Unicode 16). On top of that PrettyPrompt adds grapheme-cluster +/// awareness, so a multi-scalar cluster (an emoji ZWJ sequence such as "🤦🏼‍♂️", or a base character followed +/// by combining marks / a variation selector) is sized as the single glyph the terminal draws. 
The rules: +/// +/// - A cluster occupies the width of its base scalar value. Trailing combining marks, zero-width joiners, +/// emoji modifiers (e.g. skin tones), and variation selectors shape the glyph but add no columns. +/// - Exception: a halfwidth katakana voiced / semi-voiced sound mark (U+FF9E, U+FF9F) is a grapheme extender +/// too, but it is a spacing mark that takes its own halfwidth cell rather than overlaying the base, +/// so it adds a column — e.g. "ﾊﾟ" is one cluster but two columns. +/// - Every cluster's width is capped at 2, because the renderer models each cell as one or two columns +/// wide (see ). Without the cap, summing a cluster's parts (e.g. base + skin-tone modifier +/// = 2 + 2) produced widths of 3-5 and crashed cursor positioning. /// +/// See https://github.com/waf/PrettyPrompt/issues/270 public static class UnicodeWidth { /// @@ -68,7 +72,7 @@ public static int GetWidth(ReadOnlySpan text) public static int GetGraphemeClusterWidth(ReadOnlySpan cluster) { if (cluster.IsEmpty) return 0; - if (Rune.DecodeFromUtf16(cluster, out var baseRune, out _) != OperationStatus.Done) + if (Rune.DecodeFromUtf16(cluster, out var baseRune, out int baseLength) != OperationStatus.Done) { return 1; // ill-formed (e.g. a lone surrogate); be defensive and reserve a single column. } @@ -77,7 +81,21 @@ public static int GetGraphemeClusterWidth(ReadOnlySpan cluster) // that defaults to 1 - e.g. ⚠ (U+26A0) is 1 column but ⚠️ is a 2-column emoji; wcwidth misses this. // The length check skips a lone, base-less selector (no width of its own).
if (cluster.Length > 1 && cluster.Contains((char)0xFE0F)) return 2; - return Clamp(UnicodeCalculator.GetWidth(baseRune)); + + int width = Clamp(UnicodeCalculator.GetWidth(baseRune)); + + // Halfwidth katakana voiced / semi-voiced sound marks (U+FF9E ﾞ, U+FF9F ﾟ) are SPACING grapheme + // extenders: StringInfo clusters each onto the preceding kana, but unlike a combining mark that + // overlays its base they render in their own halfwidth cell (category Lm, wcwidth 1). So e.g. "ﾊﾟ" + // (U+FF8A U+FF9F) is one cluster but occupies two columns. Add a column per trailing mark, which also + // keeps this in step with the per-char GetWidth path (it already counts them). The base path above + // handles a lone, base-less mark. See https://github.com/microsoft/terminal/issues/18087. + foreach (var c in cluster.Slice(baseLength)) + { + if (c is (char)0xFF9E or (char)0xFF9F) width++; + } + + return Math.Min(width, 2); // cap at the cell model's two columns (see remarks) } /// diff --git a/tests/PrettyPrompt.Tests/GraphemeTests.cs b/tests/PrettyPrompt.Tests/GraphemeTests.cs index 0521d4f..123c168 100644 --- a/tests/PrettyPrompt.Tests/GraphemeTests.cs +++ b/tests/PrettyPrompt.Tests/GraphemeTests.cs @@ -41,4 +41,24 @@ public void PreviousBoundary_StepsOverWholeCluster(int index, int expected) [InlineData(9, 9)] public void RoundDownToBoundary_SnapsMidClusterIndices(int index, int expected) => Assert.Equal(expected, Grapheme.RoundDownToBoundary(Text, index)); + + // "ﾊﾟｸﾞ" = パグ ("pug") in halfwidth katakana = U+FF8A U+FF9F U+FF78 U+FF9E. The voiced / semi-voiced + // sound marks (U+FF9E/U+FF9F) are grapheme extenders, so each kana+mark pair is ONE cluster: the caret + // steps over a pair as a single editing unit (it never lands between a kana and its mark), even though + // each pair displays as two columns. See https://github.com/waf/PrettyPrompt/issues/270.
+ private const string Pug = "ﾊﾟｸﾞ"; + + [Theory] + [InlineData(0, 2)] // past ﾊﾟ (both chars of the first cluster) + [InlineData(2, 4)] // past ｸﾞ + [InlineData(4, 4)] // clamped at end + public void NextBoundary_TreatsKanaPlusSoundMarkAsOneCluster(int index, int expected) + => Assert.Equal(expected, Grapheme.NextBoundary(Pug, index)); + + [Theory] + [InlineData(4, 2)] // before ｸﾞ + [InlineData(2, 0)] // before ﾊﾟ + [InlineData(0, 0)] // clamped at 0 + public void PreviousBoundary_TreatsKanaPlusSoundMarkAsOneCluster(int index, int expected) + => Assert.Equal(expected, Grapheme.PreviousBoundary(Pug, index)); } diff --git a/tests/PrettyPrompt.Tests/ScreenTests.cs b/tests/PrettyPrompt.Tests/ScreenTests.cs index f8eafe2..5ca0ac8 100644 --- a/tests/PrettyPrompt.Tests/ScreenTests.cs +++ b/tests/PrettyPrompt.Tests/ScreenTests.cs @@ -66,6 +66,12 @@ public class ScreenTests // base char + combining mark is a single-column cluster: the cursor lands one column past it, not two. [InlineData("e\u0301", 1)] // e + combining acute accent (decomposed "é") [InlineData("ae\u0301b", 3)] // a + combining "é" + b + // Halfwidth katakana + (semi-)voiced sound mark: each kana+mark pair is ONE grapheme cluster, but the + // sound mark is a SPACING extender that takes its own halfwidth cell, so a pair is two columns. The + // cursor must land at column 4 past "ﾊﾟｸﾞ" (= パグ "pug"), matching what the terminal renders - not 2. + // See https://github.com/microsoft/terminal/issues/18087 and issue #270.
+ [InlineData("ﾊﾟｸﾞ", 4)] // ﾊﾟｸﾞ = U+FF8A + semi-voiced U+FF9F + U+FF78 + voiced U+FF9E + [InlineData("aﾊﾟb", 4)] // a + ﾊﾟ kana+mark cluster (2 columns) + b public void ScreenCursorPositionTest(string text, int expectedCursorPosition) { var screen = new Screen( diff --git a/tests/PrettyPrompt.Tests/UnicodeWidthTests.cs b/tests/PrettyPrompt.Tests/UnicodeWidthTests.cs index 477c26d..2738b53 100644 --- a/tests/PrettyPrompt.Tests/UnicodeWidthTests.cs +++ b/tests/PrettyPrompt.Tests/UnicodeWidthTests.cs @@ -45,6 +45,17 @@ public class UnicodeWidthTests [InlineData("⚠️", 2)] // ⚠️ warning sign + VS16, emoji presentation = 2 columns [InlineData("abc⚠️def", 8)] // surrounded: abc (3) + ⚠️ (2) + def (3) [InlineData("ℹ️", 2)] // ℹ️ information source + VS16 + + // Halfwidth katakana voiced/semi-voiced sound marks (U+FF9E ﾞ, U+FF9F ﾟ) are spacing grapheme EXTENDERS: + // StringInfo clusters each with the preceding kana, but - unlike a zero-width combining mark that overlays + // its base - each renders in its own halfwidth cell, so a kana+mark cluster is two columns. The per-char + // GetWidth path already counts these (wcwidth gives them 1); the cluster path must match. + // See https://github.com/microsoft/terminal/issues/18087 and https://github.com/waf/PrettyPrompt/issues/270.
+ [InlineData("ﾊﾟｸﾞ", 4)] // ﾊﾟｸﾞ = パグ ("pug" in halfwidth katakana): 4 columns, not 2 + [InlineData("ﾊﾟ", 2)] // ﾊﾟ one kana + semi-voiced sound mark (U+FF9F) cluster = 2 columns + [InlineData("ｸﾞ", 2)] // ｸﾞ one kana + voiced sound mark (U+FF9E) cluster = 2 columns + [InlineData("ﾞ", 1)] // a lone voiced sound mark is its own one-column cluster + [InlineData("aﾊﾟb", 4)] // surrounded: a (1) + ﾊﾟ (2) + b (1) public void GetWidth_ReturnsExpectedDisplayWidth(string text, int expectedWidth) { Assert.Equal(expectedWidth, UnicodeWidth.GetWidth(text)); @@ -60,6 +71,11 @@ public void GetWidth_ReturnsExpectedDisplayWidth(string text, int expectedWidth) [InlineData("\u4E66", 2)] // 书 wide [InlineData("⚠", 1)] // ⚠ warning sign on its own = text presentation, 1 column [InlineData("⚠️", 2)] // ⚠️ warning sign + VS16 = emoji presentation, 2 columns + // halfwidth kana + halfwidth (semi-)voiced sound mark: the mark is a spacing extender, not a zero-width + // combining mark, so it adds its own column - see microsoft/terminal#18087. + [InlineData("ﾊﾟ", 2)] // ﾊﾟ = U+FF8A + semi-voiced sound mark U+FF9F + [InlineData("ｸﾞ", 2)] // ｸﾞ = U+FF78 + voiced sound mark U+FF9E + [InlineData("ﾞ", 1)] // a lone halfwidth voiced sound mark = 1 column public void GetGraphemeClusterWidth_IsCappedAtTwo(string cluster, int expectedWidth) { Assert.Equal(expectedWidth, UnicodeWidth.GetGraphemeClusterWidth(cluster));