From 7c3c644746daaad546a89079cc503ff475ebd6e6 Mon Sep 17 00:00:00 2001
From: Will Fuqua <wafuqua@gmail.com>
Date: Sun, 7 Jun 2026 14:09:22 +0700
Subject: [PATCH] Handle spacing grapheme extenders U+FF9E-U+FF9F

---
 src/PrettyPrompt/Rendering/UnicodeWidth.cs    | 40 ++++++++++++++-----
 tests/PrettyPrompt.Tests/GraphemeTests.cs     | 20 ++++++++++
 tests/PrettyPrompt.Tests/ScreenTests.cs       |  6 +++
 tests/PrettyPrompt.Tests/UnicodeWidthTests.cs | 16 ++++++++
 4 files changed, 71 insertions(+), 11 deletions(-)
diff --git a/src/PrettyPrompt/Rendering/UnicodeWidth.cs b/src/PrettyPrompt/Rendering/UnicodeWidth.cs
index 75f3e26..d122e62 100644
--- a/src/PrettyPrompt/Rendering/UnicodeWidth.cs
+++ b/src/PrettyPrompt/Rendering/UnicodeWidth.cs
@@ -15,17 +15,21 @@ namespace PrettyPrompt.Rendering;
 /// <summary>
 /// Calculates how many terminal columns ("cells") a character or string occupies.
 ///
-/// <para>
 /// Per-scalar widths come from <see cref="UnicodeCalculator"/> — a vendored, source-only copy of
-/// https://github.com/spectreconsole/wcwidth (Unicode 16). PrettyPrompt layers grapheme-cluster
-/// awareness on top: a cluster (an emoji ZWJ sequence such as "🤦🏼‍♂️", or a base character followed
-/// by combining marks / a variation selector) occupies the width of its <b>base scalar value</b> — the
-/// trailing scalars modify the glyph but add no columns. Each cluster's width is capped at 2, because
-/// the renderer models every cell as one or two columns wide (see <see cref="Cell"/>); without the cap,
-/// summing the parts of a cluster (e.g. base + skin-tone modifier = 2 + 2) produced widths of 3-5 and
-/// crashed cursor positioning. See https://github.com/waf/PrettyPrompt/issues/270.
-/// </para>
+/// https://github.com/spectreconsole/wcwidth (Unicode 16). On top of that, PrettyPrompt adds grapheme-cluster
+/// awareness, so a multi-scalar cluster (an emoji ZWJ sequence such as "🤦🏼‍♂️", or a base character followed
+/// by combining marks / a variation selector) is sized as the single glyph the terminal draws. The rules:
+///
+/// - A cluster occupies the width of its <b>base scalar value</b>. Trailing combining marks, zero-width joiners,
+///   emoji modifiers (e.g. skin tones), and variation selectors shape the glyph but add no columns.
+/// - Exception: a halfwidth katakana voiced / semi-voiced sound mark (U+FF9E, U+FF9F) is a grapheme extender
+///   too, but it is a <b>spacing</b> mark that takes its own halfwidth cell rather than overlaying the base,
+///   so it adds a column — e.g. "ﾊﾟ" is one cluster but two columns.
+/// - Every cluster's width is <b>capped at 2</b>, because the renderer models each cell as one or two columns
+///   wide (see <see cref="Cell"/>). Without the cap, summing a cluster's parts (e.g. base + skin-tone modifier
+///   = 2 + 2) produced widths of 3-5 and crashed cursor positioning.
 /// </summary>
+/// <remarks>See https://github.com/waf/PrettyPrompt/issues/270</remarks>
 public static class UnicodeWidth
 {
     /// <summary>
@@ -68,7 +72,7 @@ public static int GetWidth(ReadOnlySpan<char> text)
     public static int GetGraphemeClusterWidth(ReadOnlySpan<char> cluster)
     {
         if (cluster.IsEmpty) return 0;
-        if (Rune.DecodeFromUtf16(cluster, out var baseRune, out _) != OperationStatus.Done)
+        if (Rune.DecodeFromUtf16(cluster, out var baseRune, out int baseLength) != OperationStatus.Done)
         {
             return 1; // ill-formed (e.g. a lone surrogate); be defensive and reserve a single column.
         }
@@ -77,7 +81,21 @@ public static int GetGraphemeClusterWidth(ReadOnlySpan<char> cluster)
         // that defaults to 1 - e.g. ⚠ (U+26A0) is 1 column but ⚠️ is a 2-column emoji; wcwidth misses this.
         // The length check skips a lone, base-less selector (no width of its own).
         if (cluster.Length > 1 && cluster.Contains((char)0xFE0F)) return 2;
-        return Clamp(UnicodeCalculator.GetWidth(baseRune));
+
+        int width = Clamp(UnicodeCalculator.GetWidth(baseRune));
+
+        // Halfwidth katakana voiced / semi-voiced sound marks (U+FF9E ﾞ, U+FF9F ﾟ) are SPACING grapheme
+        // extenders: StringInfo clusters each onto the preceding kana, but unlike a combining mark that
+        // overlays its base they render in their own halfwidth cell (category Lm, wcwidth 1). So e.g. "ﾊﾟ"
+        // (U+FF8A U+FF9F) is one cluster but occupies two columns. Add a column per trailing mark, which also
+        // keeps this in step with the per-char GetWidth path (it already counts them). The base path above
+        // handles a lone, base-less mark. See https://github.com/microsoft/terminal/issues/18087.
+        foreach (var c in cluster.Slice(baseLength))
+        {
+            if (c is (char)0xFF9E or (char)0xFF9F) width++;
+        }
+
+        return Math.Min(width, 2); // cap at the cell model's two columns (see remarks)
     }
 
     /// <summary>
diff --git a/tests/PrettyPrompt.Tests/GraphemeTests.cs b/tests/PrettyPrompt.Tests/GraphemeTests.cs
index 0521d4f..123c168 100644
--- a/tests/PrettyPrompt.Tests/GraphemeTests.cs
+++ b/tests/PrettyPrompt.Tests/GraphemeTests.cs
@@ -41,4 +41,24 @@ public void PreviousBoundary_StepsOverWholeCluster(int index, int expected)
     [InlineData(9, 9)]
     public void RoundDownToBoundary_SnapsMidClusterIndices(int index, int expected)
         => Assert.Equal(expected, Grapheme.RoundDownToBoundary(Text, index));
+
+    // "ﾊﾟｸﾞ" = パグ ("pug") in halfwidth katakana = U+FF8A U+FF9F U+FF78 U+FF9E. The voiced / semi-voiced
+    // sound marks (U+FF9E/U+FF9F) are grapheme extenders, so each kana+mark pair is ONE cluster: the caret
+    // steps over a pair as a single editing unit (it never lands between a kana and its mark), even though
+    // each pair displays as two columns. See https://github.com/waf/PrettyPrompt/issues/270.
+    private const string Pug = "ﾊﾟｸﾞ";
+
+    [Theory]
+    [InlineData(0, 2)] // past ﾊﾟ (both chars of the first cluster)
+    [InlineData(2, 4)] // past ｸﾞ
+    [InlineData(4, 4)] // clamped at end
+    public void NextBoundary_TreatsKanaPlusSoundMarkAsOneCluster(int index, int expected)
+        => Assert.Equal(expected, Grapheme.NextBoundary(Pug, index));
+
+    [Theory]
+    [InlineData(4, 2)] // before ｸﾞ
+    [InlineData(2, 0)] // before ﾊﾟ
+    [InlineData(0, 0)] // clamped at 0
+    public void PreviousBoundary_TreatsKanaPlusSoundMarkAsOneCluster(int index, int expected)
+        => Assert.Equal(expected, Grapheme.PreviousBoundary(Pug, index));
 }
diff --git a/tests/PrettyPrompt.Tests/ScreenTests.cs b/tests/PrettyPrompt.Tests/ScreenTests.cs
index f8eafe2..5ca0ac8 100644
--- a/tests/PrettyPrompt.Tests/ScreenTests.cs
+++ b/tests/PrettyPrompt.Tests/ScreenTests.cs
@@ -66,6 +66,12 @@ public class ScreenTests
     // base char + combining mark is a single-column cluster: the cursor lands one column past it, not two.
     [InlineData("e\u0301", 1)]                                    // e + combining acute accent (decomposed "é")
     [InlineData("ae\u0301b", 3)]                                  // a + combining "é" + b
+    // Halfwidth katakana + (semi-)voiced sound mark: each kana+mark pair is ONE grapheme cluster, but the
+    // sound mark is a SPACING extender that takes its own halfwidth cell, so a pair is two columns. The
+    // cursor must land at column 4 past "ﾊﾟｸﾞ" (= パグ "pug"), matching what the terminal renders - not 2.
+    // See https://github.com/microsoft/terminal/issues/18087 and issue #270.
+    [InlineData("ﾊﾟｸﾞ", 4)]                   // ﾊﾟｸﾞ = U+FF8A + semi-voiced U+FF9F + U+FF78 + voiced U+FF9E
+    [InlineData("aﾊﾟb", 4)]                            // a + ﾊﾟ kana+mark cluster (2 columns) + b
     public void ScreenCursorPositionTest(string text, int expectedCursorPosition)
     {
         var screen = new Screen(
diff --git a/tests/PrettyPrompt.Tests/UnicodeWidthTests.cs b/tests/PrettyPrompt.Tests/UnicodeWidthTests.cs
index 477c26d..2738b53 100644
--- a/tests/PrettyPrompt.Tests/UnicodeWidthTests.cs
+++ b/tests/PrettyPrompt.Tests/UnicodeWidthTests.cs
@@ -45,6 +45,17 @@ public class UnicodeWidthTests
     [InlineData("⚠️", 2)]                               // ⚠️ warning sign + VS16, emoji presentation = 2 columns
     [InlineData("abc⚠️def", 8)]                         // surrounded: abc (3) + ⚠️ (2) + def (3)
     [InlineData("ℹ️", 2)]                               // ℹ️ information source + VS16
+
+    // Halfwidth katakana voiced/semi-voiced sound marks (U+FF9E ﾞ, U+FF9F ﾟ) are spacing grapheme EXTENDERS:
+    // StringInfo clusters each with the preceding kana, but - unlike a zero-width combining mark that overlays
+    // its base - each renders in its own halfwidth cell, so a kana+mark cluster is two columns. The per-char
+    // GetWidth path already counts these (wcwidth gives them 1); the cluster path must match.
+    // See https://github.com/microsoft/terminal/issues/18087 and https://github.com/waf/PrettyPrompt/issues/270.
+    [InlineData("ﾊﾟｸﾞ", 4)]  // ﾊﾟｸﾞ = パグ ("pug" in halfwidth katakana): 4 columns, not 2
+    [InlineData("ﾊﾟ", 2)]              // ﾊﾟ one kana + semi-voiced sound mark (U+FF9F) cluster = 2 columns
+    [InlineData("ｸﾞ", 2)]              // ｸﾞ one kana + voiced sound mark (U+FF9E) cluster = 2 columns
+    [InlineData("ﾞ", 1)]                    // a lone voiced sound mark is its own one-column cluster
+    [InlineData("aﾊﾟb", 4)]            // surrounded: a (1) + ﾊﾟ (2) + b (1)
     public void GetWidth_ReturnsExpectedDisplayWidth(string text, int expectedWidth)
     {
         Assert.Equal(expectedWidth, UnicodeWidth.GetWidth(text));
@@ -60,6 +71,11 @@ public void GetWidth_ReturnsExpectedDisplayWidth(string text, int expectedWidth)
     [InlineData("\u4E66", 2)]                                     // 书 wide
     [InlineData("⚠", 1)]                                        // ⚠ warning sign on its own = text presentation, 1 column
     [InlineData("⚠️", 2)]                                        // ⚠️ warning sign + VS16 = emoji presentation, 2 columns
+    // halfwidth kana + halfwidth (semi-)voiced sound mark: the mark is a spacing extender, not a zero-width
+    // combining mark, so it adds its own column - see microsoft/terminal#18087.
+    [InlineData("ﾊﾟ", 2)]                                       // ﾊﾟ = U+FF8A + semi-voiced sound mark U+FF9F
+    [InlineData("ｸﾞ", 2)]                                       // ｸﾞ = U+FF78 + voiced sound mark U+FF9E
+    [InlineData("ﾞ", 1)]                                          // a lone halfwidth voiced sound mark = 1 column
     public void GetGraphemeClusterWidth_IsCappedAtTwo(string cluster, int expectedWidth)
     {
         Assert.Equal(expectedWidth, UnicodeWidth.GetGraphemeClusterWidth(cluster));