Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions Src/Common/Controls/XMLViews/ConfiguredExport.cs
Original file line number Diff line number Diff line change
Expand Up @@ -672,7 +672,9 @@ orderby s.Length descending
foreach (var ignorableString in ignorablesLongToShort)
{
// if the headword starts with the ignorable chop it off.
if (headwordLC.StartsWith(ignorableString))
// Ordinal comparison: a culture-sensitive match could chop a prefix that is only
// linguistically (not code-point) equal to an ignorable string.
if (headwordLC.StartsWith(ignorableString, StringComparison.Ordinal))
{
headwordLC = headwordLC.Substring(ignorableString.Length);
break;
Expand All @@ -683,10 +685,14 @@ orderby s.Length descending
return ""; // check again

// If the headword begins with a primary digraph then use that as the first character without doing any replacement.
// The comparison must be ordinal: the default (culture-sensitive) StartsWith performs a linguistic match,
// which on .NET Framework (Windows NLS) treats distinct letters such as ғ (U+0493) and the digraph
// г̊ (U+0433 U+030A) as equivalent. That wrongly folded ғ headwords into the г̊ letter group, so they
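// never received their own letter header. See the XHTMLExportGetLeadChar_* tests in ConfiguredExportTests,
// in particular XHTMLExportGetLeadChar_SingleCodepointLetterNotFoldedIntoSimilarDigraph.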
string firstChar = null;
foreach (var primaryDigraph in wsDigraphMap[ws.Id].Where(digraph => digraph.Value == CollationLevel.primary))
{
if (headwordLC.StartsWith(cf.ToLower(primaryDigraph.Key)))
if (headwordLC.StartsWith(cf.ToLower(primaryDigraph.Key), StringComparison.Ordinal))
firstChar = cf.ToLower(primaryDigraph.Key);
}

Expand Down Expand Up @@ -725,7 +731,9 @@ orderby s.Length descending
firstChar = headwordLC.Substring(0, cnt);
foreach (var sortChar in sortChars)
{
if (headwordLC.StartsWith(sortChar))
// Ordinal comparison: see the note on the primary-digraph check above. A culture-sensitive
// match here would re-introduce the same digraph-folding bug (e.g. ғ folded into г̊).
if (headwordLC.StartsWith(sortChar, StringComparison.Ordinal))
{
if (firstChar.Length < sortChar.Length)
firstChar = sortChar;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,47 @@ public void XHTMLExportGetLeadChar_UsesCaseAlias()
Assert.That(data, Is.EqualTo("\u0131"), "When using Azerbaijani casing, dotted and undotted I's are different letters.");
}

/// <summary>
/// Regression test: a headword that starts with the single code point U+0493 (ghe with stroke) must get
/// U+0493 as its lead character even though the collation also declares U+0433 U+030A (ghe + combining
/// ring above) as a primary digraph.
/// </summary>
/// <remarks>
/// Background, as reported with the fix: a culture-sensitive String.StartsWith on .NET Framework
/// (Windows NLS) can treat U+0493 as linguistically equal to U+0433 U+030A, which made the lead-character
/// lookup return the digraph and left U+0493 entries without a letter header of their own.
/// </remarks>
[Test]
public void XHTMLExportGetLeadChar_SingleCodepointLetterNotFoldedIntoSimilarDigraph()
{
	// Code points used to build the collation rules and the headword.
	const string plainGhe = "\u0433"; // CYRILLIC SMALL LETTER GHE
	const string plainGheUpper = "\u0413"; // CYRILLIC CAPITAL LETTER GHE
	const string strokedGhe = "\u0493"; // CYRILLIC SMALL LETTER GHE WITH STROKE
	const string strokedGheUpper = "\u0492"; // CYRILLIC CAPITAL LETTER GHE WITH STROKE
	const string smallGje = "\u0453"; // CYRILLIC SMALL LETTER GJE
	const string capitalGje = "\u0403"; // CYRILLIC CAPITAL LETTER GJE
	const string ringAbove = "\u030a"; // COMBINING RING ABOVE
	const string caronMark = "\u030c"; // COMBINING CARON
	const string smallA = "\u0430"; // CYRILLIC SMALL LETTER A

	// First rule: ghe, then gje (capital as a tertiary variant), then the ghe+ring digraph
	// (capital as a tertiary variant). The ghe+ring digraph is therefore a primary digraph.
	var plainGheRule =
		$"&{plainGhe}<{smallGje}<<<{capitalGje}<{plainGhe}{ringAbove}<<<{plainGheUpper}{ringAbove}";
	// Second rule: ghe-with-stroke, then ghe-with-stroke+caron, then ghe-with-stroke+ring
	// (capitals as tertiary variants). Ghe-with-stroke is its own primary letter.
	var strokedGheRule =
		$"&{strokedGhe}<{strokedGhe}{caronMark}<<<{strokedGheUpper}{caronMark}" +
		$"<{strokedGhe}{ringAbove}<<<{strokedGheUpper}{ringAbove}";
	var icuRules = string.Join(Environment.NewLine, plainGheRule, strokedGheRule);

	// The three per-writing-system lookup maps start out empty and are handed to GetLeadChar as-is.
	var digraphsByWs = new Dictionary<string, Dictionary<string, ConfiguredExport.CollationLevel>>();
	var equivalentsByWs = new Dictionary<string, Dictionary<string, string>>();
	var ignorablesByWs = new Dictionary<string, ISet<string>>();

	// Register the "kca" writing system with the custom ICU collation as a current vernacular ws.
	Cache.ServiceLocator.WritingSystemManager.GetOrSet("kca", out var kcaWs);
	var collation = new IcuRulesCollationDefinition("standard");
	collation.IcuRules = icuRules;
	kcaWs.DefaultCollation = collation;
	Cache.ServiceLocator.WritingSystems.AddToCurrentVernacularWritingSystems(kcaWs);

	var headword = strokedGhe + smallA;
	string leadChar = null;
	Assert.DoesNotThrow(() =>
	{
		leadChar = ConfiguredExport.GetLeadChar(
			headword, "kca", digraphsByWs, equivalentsByWs, ignorablesByWs, null, Cache);
	});

	Assert.That(leadChar, Is.EqualTo(strokedGhe),
		"Headword U+0493 U+0430 must be filed under U+0493 (ghe with stroke), not folded into the U+0433 U+030A digraph.");
}

/// <summary>
/// Test verifies minimal behavior added for sort rules other than Toolbox and ICU
/// (which currently does something minimal, enough to prevent crashes).
Expand Down
Loading