Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 29 additions & 11 deletions src/PrettyPrompt/Rendering/UnicodeWidth.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,21 @@ namespace PrettyPrompt.Rendering;
/// <summary>
/// Calculates how many terminal columns ("cells") a character or string occupies.
///
/// <para>
/// Per-scalar widths come from <see cref="UnicodeCalculator"/> — a vendored, source-only copy of
/// https://github.com/spectreconsole/wcwidth (Unicode 16). PrettyPrompt layers grapheme-cluster
/// awareness on top: a cluster (an emoji ZWJ sequence such as "🤦🏼‍♂️", or a base character followed
/// by combining marks / a variation selector) occupies the width of its <b>base scalar value</b> — the
/// trailing scalars modify the glyph but add no columns. Each cluster's width is capped at 2, because
/// the renderer models every cell as one or two columns wide (see <see cref="Cell"/>); without the cap,
/// summing the parts of a cluster (e.g. base + skin-tone modifier = 2 + 2) produced widths of 3-5 and
/// crashed cursor positioning. See https://github.com/waf/PrettyPrompt/issues/270.
/// </para>
/// https://github.com/spectreconsole/wcwidth (Unicode 16). On top of that PrettyPrompt adds grapheme-cluster
/// awareness, so a multi-scalar cluster (an emoji ZWJ sequence such as "🤦🏼‍♂️", or a base character followed
/// by combining marks / a variation selector) is sized as the single glyph the terminal draws. The rules:
///
/// - A cluster occupies the width of its <b>base scalar value</b>. Trailing combining marks, zero-width joiners,
/// emoji modifiers (e.g. skin tones), and variation selectors shape the glyph but add no columns.
/// - Exception: a halfwidth katakana voiced / semi-voiced sound mark (U+FF9E, U+FF9F) is a grapheme extender
/// too, but it is a <b>spacing</b> mark that takes its own halfwidth cell rather than overlaying the base,
/// so it adds a column — e.g. "ﾊﾟ" (U+FF8A U+FF9F) is one cluster but two columns.
/// - Every cluster's width is <b>capped at 2</b>, because the renderer models each cell as one or two columns
/// wide (see <see cref="Cell"/>). Without the cap, summing a cluster's parts (e.g. base + skin-tone modifier
/// = 2 + 2) produced widths of 3-5 and crashed cursor positioning.
/// </summary>
/// <remarks>See https://github.com/waf/PrettyPrompt/issues/270</remarks>
public static class UnicodeWidth
{
/// <summary>
Expand Down Expand Up @@ -68,7 +72,7 @@ public static int GetWidth(ReadOnlySpan<char> text)
public static int GetGraphemeClusterWidth(ReadOnlySpan<char> cluster)
{
if (cluster.IsEmpty) return 0;
if (Rune.DecodeFromUtf16(cluster, out var baseRune, out _) != OperationStatus.Done)
if (Rune.DecodeFromUtf16(cluster, out var baseRune, out int baseLength) != OperationStatus.Done)
{
return 1; // ill-formed (e.g. a lone surrogate); be defensive and reserve a single column.
}
Expand All @@ -77,7 +81,21 @@ public static int GetGraphemeClusterWidth(ReadOnlySpan<char> cluster)
// that defaults to 1 - e.g. ⚠ (U+26A0) is 1 column but ⚠️ is a 2-column emoji; wcwidth misses this.
// The length check skips a lone, base-less selector (no width of its own).
if (cluster.Length > 1 && cluster.Contains((char)0xFE0F)) return 2;
return Clamp(UnicodeCalculator.GetWidth(baseRune));

int width = Clamp(UnicodeCalculator.GetWidth(baseRune));

// Halfwidth katakana voiced / semi-voiced sound marks (U+FF9E ﾞ, U+FF9F ﾟ) are SPACING grapheme
// extenders: StringInfo clusters each onto the preceding kana, but unlike a combining mark that
// overlays its base they render in their own halfwidth cell (category Lm, wcwidth 1). So e.g. "ﾊﾟ"
// (U+FF8A U+FF9F) is one cluster but occupies two columns. Add a column per trailing mark, which also
// keeps this in step with the per-char GetWidth path (it already counts them). The base path above
// handles a lone, base-less mark. See https://github.com/microsoft/terminal/issues/18087.
foreach (var c in cluster.Slice(baseLength))
{
if (c is (char)0xFF9E or (char)0xFF9F) width++;
}

return Math.Min(width, 2); // cap at the cell model's two columns (see remarks)
}

/// <summary>
Expand Down
20 changes: 20 additions & 0 deletions tests/PrettyPrompt.Tests/GraphemeTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,24 @@ public void PreviousBoundary_StepsOverWholeCluster(int index, int expected)
[InlineData(9, 9)]
public void RoundDownToBoundary_SnapsMidClusterIndices(int index, int expected)
=> Assert.Equal(expected, Grapheme.RoundDownToBoundary(Text, index));

// Pug is "pug" (パグ) written in HALFWIDTH katakana: U+FF8A (ﾊ) U+FF9F (ﾟ) U+FF78 (ｸ) U+FF9E (ﾞ), i.e. four
// UTF-16 chars. The voiced / semi-voiced sound marks (U+FF9E/U+FF9F) are grapheme extenders, so each
// kana+mark pair is ONE cluster: the caret steps over a pair as a single editing unit (it never lands
// between a kana and its mark), even though each pair displays as two columns.
// The literal is spelled with \u escapes on purpose: tools that apply Unicode normalization would otherwise
// fold it into the two-char fullwidth "パグ", silently invalidating the index-based expectations in the
// theories that use it. See https://github.com/waf/PrettyPrompt/issues/270.
private const string Pug = "\uFF8A\uFF9F\uFF78\uFF9E";

[Theory]
[InlineData(0, 2)] // from the start: jumps over the whole first kana + sound-mark pair (both chars)
[InlineData(2, 4)] // from the second pair's start: jumps over that pair to the end of the text
[InlineData(4, 4)] // already at the end: the result stays clamped there
public void NextBoundary_TreatsKanaPlusSoundMarkAsOneCluster(int index, int expected)
{
    // Act: ask for the next caret stop at or after the given char index.
    int actual = Grapheme.NextBoundary(Pug, index);

    // Assert: the caret never stops between a kana and its sound mark.
    Assert.Equal(expected, actual);
}

[Theory]
[InlineData(4, 2)] // from the end: steps back over the whole second kana + sound-mark pair
[InlineData(2, 0)] // from the second pair's start: steps back over the first pair
[InlineData(0, 0)] // already at the start: the result stays clamped at 0
public void PreviousBoundary_TreatsKanaPlusSoundMarkAsOneCluster(int index, int expected)
{
    // Act: ask for the previous caret stop at or before the given char index.
    int actual = Grapheme.PreviousBoundary(Pug, index);

    // Assert: the caret never stops between a kana and its sound mark.
    Assert.Equal(expected, actual);
}
}
6 changes: 6 additions & 0 deletions tests/PrettyPrompt.Tests/ScreenTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@ public class ScreenTests
// base char + combining mark is a single-column cluster: the cursor lands one column past it, not two.
[InlineData("e\u0301", 1)] // e + combining acute accent (decomposed "é")
[InlineData("ae\u0301b", 3)] // a + combining "é" + b
// Halfwidth katakana + (semi-)voiced sound mark: each kana+mark pair is ONE grapheme cluster, but the
// sound mark is a SPACING extender that takes its own halfwidth cell, so a pair is two columns. The
// cursor must land at column 4 past "ﾊﾟｸﾞ" (= パグ "pug"), matching what the terminal renders - not 2.
// See https://github.com/microsoft/terminal/issues/18087 and issue #270.
[InlineData("パグ", 4)] // パグ = U+FF8A + semi-voiced U+FF9F + U+FF78 + voiced U+FF9E
[InlineData("aパb", 4)] // a + パ kana+mark cluster (2 columns) + b
public void ScreenCursorPositionTest(string text, int expectedCursorPosition)
{
var screen = new Screen(
Expand Down
16 changes: 16 additions & 0 deletions tests/PrettyPrompt.Tests/UnicodeWidthTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,17 @@ public class UnicodeWidthTests
[InlineData("⚠️", 2)] // ⚠️ warning sign + VS16, emoji presentation = 2 columns
[InlineData("abc⚠️def", 8)] // surrounded: abc (3) + ⚠️ (2) + def (3)
[InlineData("ℹ️", 2)] // ℹ️ information source + VS16

// Halfwidth katakana voiced/semi-voiced sound marks (U+FF9E ﾞ, U+FF9F ﾟ) are spacing grapheme EXTENDERS:
// StringInfo clusters each with the preceding kana, but - unlike a zero-width combining mark that overlays
// its base - each renders in its own halfwidth cell, so a kana+mark cluster is two columns. The per-char
// GetWidth path already counts these (wcwidth gives them 1); the cluster path must match.
// See https://github.com/microsoft/terminal/issues/18087 and https://github.com/waf/PrettyPrompt/issues/270.
[InlineData("パグ", 4)] // パグ = パグ ("pug" in halfwidth katakana): 4 columns, not 2
[InlineData("パ", 2)] // パ one kana + semi-voiced sound mark (U+FF9F) cluster = 2 columns
[InlineData("グ", 2)] // グ one kana + voiced sound mark (U+FF9E) cluster = 2 columns
[InlineData("゙", 1)] // a lone voiced sound mark is its own one-column cluster
[InlineData("aパb", 4)] // surrounded: a (1) + パ (2) + b (1)
public void GetWidth_ReturnsExpectedDisplayWidth(string text, int expectedWidth)
{
Assert.Equal(expectedWidth, UnicodeWidth.GetWidth(text));
Expand All @@ -60,6 +71,11 @@ public void GetWidth_ReturnsExpectedDisplayWidth(string text, int expectedWidth)
[InlineData("\u4E66", 2)] // 书 wide
[InlineData("⚠", 1)] // ⚠ warning sign on its own = text presentation, 1 column
[InlineData("⚠️", 2)] // ⚠️ warning sign + VS16 = emoji presentation, 2 columns
// halfwidth kana + halfwidth (semi-)voiced sound mark: the mark is a spacing extender, not a zero-width
// combining mark, so it adds its own column - see microsoft/terminal#18087.
[InlineData("パ", 2)] // パ = U+FF8A + semi-voiced sound mark U+FF9F
[InlineData("グ", 2)] // グ = U+FF78 + voiced sound mark U+FF9E
[InlineData("゙", 1)] // a lone halfwidth voiced sound mark = 1 column
public void GetGraphemeClusterWidth_IsCappedAtTwo(string cluster, int expectedWidth)
{
Assert.Equal(expectedWidth, UnicodeWidth.GetGraphemeClusterWidth(cluster));
Expand Down
Loading