From 42953d166af5e9ca6a7da7ab98d109f37d44cd4d Mon Sep 17 00:00:00 2001 From: Jason Naylor Date: Mon, 15 Jun 2026 14:29:59 -0700 Subject: [PATCH] Fix missing letter header for U+0493 headwords in dictionary export GetLeadChar matched the lead character against digraph and ignorable prefixes with the default String.StartsWith, which is culture-sensitive. On .NET Framework (Windows NLS) that linguistic comparison treats U+0493 (CYRILLIC SMALL LETTER GHE WITH STROKE) as equivalent to the digraph U+0433 U+030A (ghe plus combining ring above). A headword starting with U+0493 therefore matched the primary digraph and was folded into that digraph's letter group, leaving every such entry without its own header. Compare the three prefix checks in GetLeadChar with StringComparison.Ordinal so matching is by code point, not collation. ICU-backed globalization was unaffected, which is why this reproduced only on .NET Framework. Add a regression test asserting GetLeadChar returns U+0493 for a U+0493 headword under the affected ICU rules. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../Controls/XMLViews/ConfiguredExport.cs | 14 +++++-- .../XMLViewsTests/ConfiguredExportTests.cs | 41 +++++++++++++++++++ 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/Src/Common/Controls/XMLViews/ConfiguredExport.cs b/Src/Common/Controls/XMLViews/ConfiguredExport.cs index 8967faec13..c1d0c9f5f5 100644 --- a/Src/Common/Controls/XMLViews/ConfiguredExport.cs +++ b/Src/Common/Controls/XMLViews/ConfiguredExport.cs @@ -672,7 +672,9 @@ orderby s.Length descending foreach (var ignorableString in ignorablesLongToShort) { // if the headword starts with the ignorable chop it off. - if (headwordLC.StartsWith(ignorableString)) + // Ordinal comparison: a culture-sensitive match could chop a prefix that is only + // linguistically (not code-point) equal to an ignorable string. 
+ if (headwordLC.StartsWith(ignorableString, StringComparison.Ordinal)) { headwordLC = headwordLC.Substring(ignorableString.Length); break; @@ -683,10 +685,14 @@ orderby s.Length descending return ""; // check again // If the headword begins with a primary digraph then use that as the first character without doing any replacement. + // The comparison must be ordinal: the default (culture-sensitive) StartsWith performs a linguistic match, + // which on .NET Framework (Windows NLS) treats distinct letters such as ғ (U+0493) and the digraph + // г̊ (U+0433 U+030A) as equivalent. That wrongly folded ғ headwords into the г̊ letter group, so they + // never received their own letter header. See ConfiguredExportTests.GetLeadChar tests. string firstChar = null; foreach (var primaryDigraph in wsDigraphMap[ws.Id].Where(digraph => digraph.Value == CollationLevel.primary)) { - if (headwordLC.StartsWith(cf.ToLower(primaryDigraph.Key))) + if (headwordLC.StartsWith(cf.ToLower(primaryDigraph.Key), StringComparison.Ordinal)) firstChar = cf.ToLower(primaryDigraph.Key); } @@ -725,7 +731,9 @@ orderby s.Length descending firstChar = headwordLC.Substring(0, cnt); foreach (var sortChar in sortChars) { - if (headwordLC.StartsWith(sortChar)) + // Ordinal comparison: see the note on the primary-digraph check above. A culture-sensitive + // match here would re-introduce the same digraph-folding bug (e.g. ғ folded into г̊). 
+ if (headwordLC.StartsWith(sortChar, StringComparison.Ordinal)) { if (firstChar.Length < sortChar.Length) firstChar = sortChar; diff --git a/Src/Common/Controls/XMLViews/XMLViewsTests/ConfiguredExportTests.cs b/Src/Common/Controls/XMLViews/XMLViewsTests/ConfiguredExportTests.cs index ff07fe700b..268b3efb3c 100644 --- a/Src/Common/Controls/XMLViews/XMLViewsTests/ConfiguredExportTests.cs +++ b/Src/Common/Controls/XMLViews/XMLViewsTests/ConfiguredExportTests.cs @@ -431,6 +431,47 @@ public void XHTMLExportGetLeadChar_UsesCaseAlias() Assert.That(data, Is.EqualTo("\u0131"), "When using Azerbaijani casing, dotted and undotted I's are different letters."); } + [Test] + public void XHTMLExportGetLeadChar_SingleCodepointLetterNotFoldedIntoSimilarDigraph() + { + // A headword beginning with U+0493 (CYRILLIC SMALL LETTER GHE WITH STROKE) must be filed under its own + // letter header, not folded into the visually-similar primary digraph U+0433 U+030A (ghe + combining + // ring above). GetLeadChar previously matched the lead character with a culture-sensitive + // String.StartsWith; on .NET Framework (Windows NLS) the linguistic collation treats U+0493 as + // equivalent to U+0433 U+030A, so the primary-digraph check returned the digraph for a U+0493 headword + // and every such entry was merged into the digraph's section with no letter header of its own. 
+ const string ghe = "\u0433"; // CYRILLIC SMALL LETTER GHE + const string gheCap = "\u0413"; // CYRILLIC CAPITAL LETTER GHE + const string gheStroke = "\u0493"; // CYRILLIC SMALL LETTER GHE WITH STROKE + const string gheStrokeCap = "\u0492"; // CYRILLIC CAPITAL LETTER GHE WITH STROKE + const string gje = "\u0453"; // CYRILLIC SMALL LETTER GJE + const string gjeCap = "\u0403"; // CYRILLIC CAPITAL LETTER GJE + const string ring = "\u030a"; // COMBINING RING ABOVE + const string caron = "\u030c"; // COMBINING CARON + const string a = "\u0430"; // CYRILLIC SMALL LETTER A + + // icuRules below encode: &ghe < gje <<< gjeCap < (ghe+ring) <<< (gheCap+ring) + // then: &gheStroke < (gheStroke+caron) <<< (gheStrokeCap+caron) < (gheStroke+ring) <<< (gheStrokeCap+ring) + // (ghe+ring) = U+0433 U+030A is a primary digraph; gheStroke = U+0493 is its own primary letter. + var icuRules = + "&" + ghe + "<" + gje + "<<<" + gjeCap + "<" + ghe + ring + "<<<" + gheCap + ring + Environment.NewLine + + "&" + gheStroke + "<" + gheStroke + caron + "<<<" + gheStrokeCap + caron + + "<" + gheStroke + ring + "<<<" + gheStrokeCap + ring; + + Cache.ServiceLocator.WritingSystemManager.GetOrSet("kca", out var wsDef); + wsDef.DefaultCollation = new IcuRulesCollationDefinition("standard") { IcuRules = icuRules }; + Cache.ServiceLocator.WritingSystems.AddToCurrentVernacularWritingSystems(wsDef); + var wsDigraphMap = new Dictionary<string, Dictionary<string, ConfiguredExport.CollationLevel>>(); + var wsCharEquivalentMap = new Dictionary<string, Dictionary<string, string>>(); + var wsIgnorableCharMap = new Dictionary<string, ISet<string>>(); + + string data = null; + Assert.DoesNotThrow(() => data = ConfiguredExport.GetLeadChar( + gheStroke + a, "kca", wsDigraphMap, wsCharEquivalentMap, wsIgnorableCharMap, null, Cache)); + Assert.That(data, Is.EqualTo(gheStroke), + "Headword U+0493 U+0430 must be filed under U+0493 (ghe with stroke), not folded into the U+0433 U+030A digraph."); + } + /// <summary> /// Test verifies minimal behavior added for sort rules other than Toolbox and ICU /// (which currently does something minimal, 
enough to prevent crashes).