diff --git a/Src/Common/Controls/XMLViews/ConfiguredExport.cs b/Src/Common/Controls/XMLViews/ConfiguredExport.cs index 8967faec13..c1d0c9f5f5 100644 --- a/Src/Common/Controls/XMLViews/ConfiguredExport.cs +++ b/Src/Common/Controls/XMLViews/ConfiguredExport.cs @@ -672,7 +672,9 @@ orderby s.Length descending foreach (var ignorableString in ignorablesLongToShort) { // if the headword starts with the ignorable chop it off. - if (headwordLC.StartsWith(ignorableString)) + // Ordinal comparison: a culture-sensitive match could chop a prefix that is only + // linguistically (not code-point) equal to an ignorable string. + if (headwordLC.StartsWith(ignorableString, StringComparison.Ordinal)) { headwordLC = headwordLC.Substring(ignorableString.Length); break; @@ -683,10 +685,14 @@ orderby s.Length descending return ""; // check again // If the headword begins with a primary digraph then use that as the first character without doing any replacement. + // The comparison must be ordinal: the default (culture-sensitive) StartsWith performs a linguistic match, + // which on .NET Framework (Windows NLS) treats distinct letters such as ғ (U+0493) and the digraph + // г̊ (U+0433 U+030A) as equivalent. That wrongly folded ғ headwords into the г̊ letter group, so they + // never received their own letter header. See ConfiguredExportTests.GetLeadChar tests. string firstChar = null; foreach (var primaryDigraph in wsDigraphMap[ws.Id].Where(digraph => digraph.Value == CollationLevel.primary)) { - if (headwordLC.StartsWith(cf.ToLower(primaryDigraph.Key))) + if (headwordLC.StartsWith(cf.ToLower(primaryDigraph.Key), StringComparison.Ordinal)) firstChar = cf.ToLower(primaryDigraph.Key); } @@ -725,7 +731,9 @@ orderby s.Length descending firstChar = headwordLC.Substring(0, cnt); foreach (var sortChar in sortChars) { - if (headwordLC.StartsWith(sortChar)) + // Ordinal comparison: see the note on the primary-digraph check above. 
A culture-sensitive + // match here would re-introduce the same digraph-folding bug (e.g. ғ folded into г̊). + if (headwordLC.StartsWith(sortChar, StringComparison.Ordinal)) { if (firstChar.Length < sortChar.Length) firstChar = sortChar; diff --git a/Src/Common/Controls/XMLViews/XMLViewsTests/ConfiguredExportTests.cs b/Src/Common/Controls/XMLViews/XMLViewsTests/ConfiguredExportTests.cs index ff07fe700b..268b3efb3c 100644 --- a/Src/Common/Controls/XMLViews/XMLViewsTests/ConfiguredExportTests.cs +++ b/Src/Common/Controls/XMLViews/XMLViewsTests/ConfiguredExportTests.cs @@ -431,6 +431,47 @@ public void XHTMLExportGetLeadChar_UsesCaseAlias() Assert.That(data, Is.EqualTo("\u0131"), "When using Azerbaijani casing, dotted and undotted I's are different letters."); } + [Test] + public void XHTMLExportGetLeadChar_SingleCodepointLetterNotFoldedIntoSimilarDigraph() + { + // A headword beginning with U+0493 (CYRILLIC SMALL LETTER GHE WITH STROKE) must be filed under its own + // letter header, not folded into the visually-similar primary digraph U+0433 U+030A (ghe + combining + // ring above). GetLeadChar previously matched the lead character with a culture-sensitive + // String.StartsWith; on .NET Framework (Windows NLS) the linguistic collation treats U+0493 as + // equivalent to U+0433 U+030A, so the primary-digraph check returned the digraph for a U+0493 headword + // and every such entry was merged into the digraph's section with no letter header of its own. 
+ const string ghe = "\u0433"; // CYRILLIC SMALL LETTER GHE + const string gheCap = "\u0413"; // CYRILLIC CAPITAL LETTER GHE + const string gheStroke = "\u0493"; // CYRILLIC SMALL LETTER GHE WITH STROKE + const string gheStrokeCap = "\u0492"; // CYRILLIC CAPITAL LETTER GHE WITH STROKE + const string gje = "\u0453"; // CYRILLIC SMALL LETTER GJE + const string gjeCap = "\u0403"; // CYRILLIC CAPITAL LETTER GJE + const string ring = "\u030a"; // COMBINING RING ABOVE + const string caron = "\u030c"; // COMBINING CARON + const string a = "\u0430"; // CYRILLIC SMALL LETTER A + + // icuRules below encode: &ghe < gje <<< gjeCap < (ghe+ring) <<< (gheCap+ring) + // then: &gheStroke < (gheStroke+caron) <<< (gheStrokeCap+caron) < (gheStroke+ring) <<< (gheStrokeCap+ring) + // (ghe+ring) = U+0433 U+030A is a primary digraph; gheStroke = U+0493 is its own primary letter. + var icuRules = + "&" + ghe + "<" + gje + "<<<" + gjeCap + "<" + ghe + ring + "<<<" + gheCap + ring + Environment.NewLine + + "&" + gheStroke + "<" + gheStroke + caron + "<<<" + gheStrokeCap + caron + + "<" + gheStroke + ring + "<<<" + gheStrokeCap + ring; + + Cache.ServiceLocator.WritingSystemManager.GetOrSet("kca", out var wsDef); + wsDef.DefaultCollation = new IcuRulesCollationDefinition("standard") { IcuRules = icuRules }; + Cache.ServiceLocator.WritingSystems.AddToCurrentVernacularWritingSystems(wsDef); + var wsDigraphMap = new Dictionary<string, Dictionary<string, ConfiguredExport.CollationLevel>>(); + var wsCharEquivalentMap = new Dictionary<string, Dictionary<string, string>>(); + var wsIgnorableCharMap = new Dictionary<string, ISet<string>>(); + + string data = null; + Assert.DoesNotThrow(() => data = ConfiguredExport.GetLeadChar( + gheStroke + a, "kca", wsDigraphMap, wsCharEquivalentMap, wsIgnorableCharMap, null, Cache)); + Assert.That(data, Is.EqualTo(gheStroke), + "Headword U+0493 U+0430 must be filed under U+0493 (ghe with stroke), not folded into the U+0433 U+030A digraph."); + } + /// <summary> /// Test verifies minimal behavior added for sort rules other than Toolbox and ICU /// (which currently does something minimal, 
enough to prevent crashes).