sillsdev · mark-sil · Jun 18, 2026
diff --git a/Src/Utilities/SfmToXml/Converter.cs b/Src/Utilities/SfmToXml/Converter.cs
@@ -2583,7 +2583,11 @@ public void ProcessSFMandData(string currentSfm, byte[] sfmData, int lineNumber,
 			processedText = processedText.Trim();	// remove whitespace
 
 			// make sure the data is only in the following range:
-			// 0x09, 0x0a, 0x0d, 0x20-0xd7ff, 0xe000-0xfffd
+			// 0x09, 0x0a, 0x0d, 0x20-0xd7ff, 0xe000-0xfffd, 0x10000-0x10ffff
+			// The last range (supplementary planes) is encoded in .NET strings as
+			// surrogate pairs (a high surrogate 0xd800-0xdbff followed by a low
+			// surrogate 0xdc00-0xdfff). These are valid XML characters and must be
+			// preserved; only lone/unpaired surrogates are invalid (LT-20644).
 			// not using foreach so chars can be removed during processing w/o messing up the iterator
 			System.Text.StringBuilder strTemp = new System.Text.StringBuilder(processedText);
 			int lastPos = strTemp.Length;
@@ -2597,6 +2601,10 @@ public void ProcessSFMandData(string currentSfm, byte[] sfmData, int lineNumber,
 				{
 					pos++;	// valid data
 				}
+				else if (char.IsHighSurrogate(c) && pos + 1 < lastPos && char.IsLowSurrogate(strTemp[pos + 1]))
+				{
+					pos += 2;	// valid surrogate pair (supplementary-plane character)
+				}
 				else
 				{
 					Log.AddError(m_SfmFileName, lineNumber,

diff --git a/Src/Utilities/SfmToXml/Sfm2XmlTests/ConverterTests.cs b/Src/Utilities/SfmToXml/Sfm2XmlTests/ConverterTests.cs
@@ -135,6 +135,61 @@ public void ConverterNormalizesTextToNfd()
 				$"Expected NFD normalization, but got: {string.Join(" ", outputText.Select(c => $"U+{(int)c:X4}"))}");
 		}
 
+		[Test]
+		public void ConverterPreservesSupplementaryPlaneCharacters()
+		{
+			// Wancho letters in the Supplementary Multilingual Plane (U+1E2C0 block),
+			// each encoded in .NET as a UTF-16 surrogate pair. Previously the importer
+			// stripped these as "invalid" characters because the validity check omitted
+			// the U+10000-U+10FFFF range (LT-20644).
+			const string supplementary = "\U0001E2CC\U0001E2C1\U0001E2D4"; // three Wancho letters
+
+			string sfmString = $@"\lx {supplementary}
+\ps n
+\ge test";
+
+			const string mappingString = @"<sfmMapping version='6.1'>
+<settings>
+<meaning app='fw.sil.org'/>
+</settings>
+<languages>
+<langDef id='English' xml:lang='en'/>
+<langDef id='Vernacular' xml:lang='fr'/>
+</languages>
+<hierarchy>
+<level name='Entry' partOf='records' beginFields='lx'/>
+<level name='Sense' partOf='Entry' beginFields='ge ps'/>
+</hierarchy>
+<fieldDescriptions>
+<field sfm='lx' name='Lexeme Form' type='string' lang='Vernacular'/>
+<field sfm='ps' name='Category' type='string' lang='English'/>
+<field sfm='ge' name='Gloss' type='string' lang='English'/>
+</fieldDescriptions>
+</sfmMapping>";
+
+			var sfmFile = Path.GetTempFileName();
+			var mappingFile = Path.GetTempFileName();
+			var outputFile = Path.GetTempFileName();
+
+			// SFM files are read as UTF-8 by the importer.
+			File.WriteAllText(sfmFile, sfmString, new UTF8Encoding(false));
+			File.WriteAllText(mappingFile, mappingString);
+
+			var converter = new Converter(null);
+			converter.Convert(sfmFile, mappingFile, outputFile);
+
+			var doc = new XmlDocument();
+			doc.Load(outputFile);
+
+			var lexemeNode = doc.SelectSingleNode("//lx | //LexemeForm | //Lexeme");
+			Assert.NotNull(lexemeNode, "Lexeme node was not found in output XML");
+
+			// The supplementary characters must survive the import unchanged
+			// (NFD normalization leaves these code points untouched).
+			Assert.AreEqual(supplementary, lexemeNode.InnerText,
+				"Supplementary-plane characters were not preserved during SFM import");
+		}
+
 		private static bool IsNfd(string s)
 		{
 			return s == s.Normalize(NormalizationForm.FormD);