diff --git a/Src/Utilities/SfmToXml/Converter.cs b/Src/Utilities/SfmToXml/Converter.cs
index 76ea7c4a4f..1eb2cdb048 100644
--- a/Src/Utilities/SfmToXml/Converter.cs
+++ b/Src/Utilities/SfmToXml/Converter.cs
@@ -2583,7 +2583,11 @@ public void ProcessSFMandData(string currentSfm, byte[] sfmData, int lineNumber,
 			processedText = processedText.Trim();	// remove whitespace
 
 			// make sure the data is only in the following range:
-			// 0x09, 0x0a, 0x0d, 0x20-0xd7ff, 0xe000-0xfffd
+			// 0x09, 0x0a, 0x0d, 0x20-0xd7ff, 0xe000-0xfffd, 0x10000-0x10ffff
+			// The last range (supplementary planes) is encoded in .NET strings as
+			// surrogate pairs (a high surrogate 0xd800-0xdbff followed by a low
+			// surrogate 0xdc00-0xdfff). These are valid XML characters and must be
+			// preserved; only lone/unpaired surrogates are invalid (LT-20644).
 			// not using foreach so chars can be removed during processing w/o messing up the iterator
 			System.Text.StringBuilder strTemp = new System.Text.StringBuilder(processedText);
 			int lastPos = strTemp.Length;
@@ -2597,6 +2601,10 @@ public void ProcessSFMandData(string currentSfm, byte[] sfmData, int lineNumber,
 				{
 					pos++;	// valid data
 				}
+				else if (char.IsHighSurrogate(c) && pos + 1 < lastPos && char.IsLowSurrogate(strTemp[pos + 1]))
+				{
+					pos += 2;	// valid surrogate pair (supplementary-plane character)
+				}
 				else
 				{
 					Log.AddError(m_SfmFileName, lineNumber,
diff --git a/Src/Utilities/SfmToXml/Sfm2XmlTests/ConverterTests.cs b/Src/Utilities/SfmToXml/Sfm2XmlTests/ConverterTests.cs
index 01d66e504f..131f619660 100644
--- a/Src/Utilities/SfmToXml/Sfm2XmlTests/ConverterTests.cs
+++ b/Src/Utilities/SfmToXml/Sfm2XmlTests/ConverterTests.cs
@@ -135,6 +135,73 @@ public void ConverterNormalizesTextToNfd()
 				$"Expected NFD normalization, but got: {string.Join(" ", outputText.Select(c => $"U+{(int)c:X4}"))}");
 		}
 
+		[Test]
+		public void ConverterPreservesSupplementaryPlaneCharacters()
+		{
+			// Wancho letters in the Supplementary Multilingual Plane (U+1E2C0 block),
+			// each encoded in .NET as a UTF-16 surrogate pair. Previously the importer
+			// stripped these as "invalid" characters because the validity check omitted
+			// the U+10000-U+10FFFF range (LT-20644).
+			const string supplementary = "\U0001E2CC\U0001E2C1\U0001E2D4"; // three Wancho letters
+
+			string sfmString = $@"\lx {supplementary}
+\ps n
+\ge test";
+
+			// NOTE(review): the mapping XML between the quotes below is blank in the copy of this
+			// change under review (markup appears stripped); TODO confirm the real document is present.
+			const string mappingString = @"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+";
+
+			var sfmFile = Path.GetTempFileName();
+			var mappingFile = Path.GetTempFileName();
+			var outputFile = Path.GetTempFileName();
+			try
+			{
+				// SFM files are read as UTF-8 by the importer.
+				File.WriteAllText(sfmFile, sfmString, new UTF8Encoding(false));
+				File.WriteAllText(mappingFile, mappingString);
+
+				var converter = new Converter(null);
+				converter.Convert(sfmFile, mappingFile, outputFile);
+
+				var doc = new XmlDocument();
+				doc.Load(outputFile);
+
+				var lexemeNode = doc.SelectSingleNode("//lx | //LexemeForm | //Lexeme");
+				Assert.NotNull(lexemeNode, "Lexeme node was not found in output XML");
+
+				// The supplementary characters must survive the import unchanged
+				// (NFD normalization leaves these code points untouched).
+				Assert.AreEqual(supplementary, lexemeNode.InnerText,
+					"Supplementary-plane characters were not preserved during SFM import");
+			}
+			finally
+			{
+				// Path.GetTempFileName() creates real zero-byte files; delete them even when an
+				// assertion fails so repeated test runs do not accumulate temp files.
+				File.Delete(sfmFile);
+				File.Delete(mappingFile);
+				File.Delete(outputFile);
+			}
+		}
+
 		private static bool IsNfd(string s)
 		{
 			return s == s.Normalize(NormalizationForm.FormD);