4141
4242package com .oracle .graal .python .parser .sst ;
4343
44- import java .util .HashMap ;
4544import java .util .Locale ;
46- import java .util .Map ;
4745
4846import com .ibm .icu .lang .UCharacter ;
4947import com .oracle .graal .python .builtins .PythonBuiltinClassType ;
5755import com .oracle .graal .python .runtime .PythonParser .ParserErrorCallback ;
5856import com .oracle .graal .python .runtime .exception .PException ;
5957import com .oracle .graal .python .util .PythonUtils ;
60- import com .oracle .truffle .api .CompilerDirectives ;
58+ import com .oracle .truffle .api .CompilerDirectives . TruffleBoundary ;
6159import com .oracle .truffle .api .source .Source ;
60+ import com .oracle .truffle .regex .chardata .UnicodeCharacterAliases ;
6261
6362public class StringUtils {
6463
@@ -263,7 +262,7 @@ private static PException createTruncatedError(String text, int startIndex, int
263262 * @param offset this is offset of the open brace
264263 * @return offset of the close brace
265264 */
266- @ CompilerDirectives . TruffleBoundary
265+ @ TruffleBoundary
267266 private static int doCharacterName (String text , StringBuilder sb , int offset ) {
268267 if (offset >= text .length ()) {
269268 throw PConstructAndRaiseNode .raiseUncachedUnicodeDecodeError ("unicodeescape" , text , offset - 2 , offset , MALFORMED_ERROR );
@@ -288,61 +287,21 @@ private static int doCharacterName(String text, StringBuilder sb, int offset) {
288287 return closeIndex ;
289288 }
290289
291- // ICU4J doesn't have names for most control characters
292- private static final Map <String , Integer > CONTROL_CHAR_NAMES = new HashMap <>(32 );
293- static {
294- CONTROL_CHAR_NAMES .put ("NULL" , 0x0000 );
295- CONTROL_CHAR_NAMES .put ("START OF HEADING" , 0x0001 );
296- CONTROL_CHAR_NAMES .put ("START OF TEXT" , 0x0002 );
297- CONTROL_CHAR_NAMES .put ("END OF TEXT" , 0x0003 );
298- CONTROL_CHAR_NAMES .put ("END OF TRANSMISSION" , 0x0004 );
299- CONTROL_CHAR_NAMES .put ("ENQUIRY" , 0x0005 );
300- CONTROL_CHAR_NAMES .put ("ACKNOWLEDGE" , 0x0006 );
301- CONTROL_CHAR_NAMES .put ("BELL" , 0x0007 );
302- CONTROL_CHAR_NAMES .put ("BACKSPACE" , 0x0008 );
303- CONTROL_CHAR_NAMES .put ("CHARACTER TABULATION" , 0x0009 );
304- CONTROL_CHAR_NAMES .put ("LINE FEED" , 0x000A );
305- CONTROL_CHAR_NAMES .put ("LINE TABULATION" , 0x000B );
306- CONTROL_CHAR_NAMES .put ("FORM FEED" , 0x000C );
307- CONTROL_CHAR_NAMES .put ("CARRIAGE RETURN" , 0x000D );
308- CONTROL_CHAR_NAMES .put ("SHIFT OUT" , 0x000E );
309- CONTROL_CHAR_NAMES .put ("SHIFT IN" , 0x000F );
310- CONTROL_CHAR_NAMES .put ("DATA LINK ESCAPE" , 0x0010 );
311- CONTROL_CHAR_NAMES .put ("DEVICE CONTROL ONE" , 0x0011 );
312- CONTROL_CHAR_NAMES .put ("DEVICE CONTROL TWO" , 0x0012 );
313- CONTROL_CHAR_NAMES .put ("DEVICE CONTROL THREE" , 0x0013 );
314- CONTROL_CHAR_NAMES .put ("DEVICE CONTROL FOUR" , 0x0014 );
315- CONTROL_CHAR_NAMES .put ("NEGATIVE ACKNOWLEDGE" , 0x0015 );
316- CONTROL_CHAR_NAMES .put ("SYNCHRONOUS IDLE" , 0x0016 );
317- CONTROL_CHAR_NAMES .put ("END OF TRANSMISSION BLOCK" , 0x0017 );
318- CONTROL_CHAR_NAMES .put ("CANCEL" , 0x0018 );
319- CONTROL_CHAR_NAMES .put ("END OF MEDIUM" , 0x0019 );
320- CONTROL_CHAR_NAMES .put ("SUBSTITUTE" , 0x001A );
321- CONTROL_CHAR_NAMES .put ("ESCAPE" , 0x001B );
322- CONTROL_CHAR_NAMES .put ("INFORMATION SEPARATOR FOUR" , 0x001C );
323- CONTROL_CHAR_NAMES .put ("INFORMATION SEPARATOR THREE" , 0x001D );
324- CONTROL_CHAR_NAMES .put ("INFORMATION SEPARATOR TWO" , 0x001E );
325- CONTROL_CHAR_NAMES .put ("INFORMATION SEPARATOR ONE" , 0x001F );
326- }
327-
328- @ CompilerDirectives .TruffleBoundary
329- public static int getCodePoint (String charName ) {
330- int possibleChar = UCharacter .getCharFromName (charName );
331- if (possibleChar > -1 ) {
332- return possibleChar ;
333- }
334- possibleChar = UCharacter .getCharFromExtendedName (charName );
335- if (possibleChar > -1 ) {
336- return possibleChar ;
337- }
338- possibleChar = UCharacter .getCharFromNameAlias (charName );
339- if (possibleChar > -1 ) {
340- return possibleChar ;
341- }
342- possibleChar = CONTROL_CHAR_NAMES .getOrDefault (charName .toUpperCase (Locale .ROOT ), -1 );
343- if (possibleChar > -1 ) {
344- return possibleChar ;
290+ @ TruffleBoundary
291+ public static int getCodePoint (String characterName ) {
292+ // CPython's logic for resolving these character names goes like this:
293+ // 1) handle Hangul Syllables in region AC00-D7A3
294+ // 2) handle CJK Ideographs
295+ // 3) handle character names as given in UnicodeData.txt
296+ // 4) handle all aliases as given in NameAliases.txt
297+ // With ICU's UCharacter, we get cases 1), 2) and 3). As for 4), the aliases, ICU only
298+ // handles aliases of type 'correction'. Therefore, we extract the contents of
299+ // NameAliases.txt and handle aliases by ourselves.
300+ String normalizedName = characterName .trim ().toUpperCase (Locale .ROOT );
301+ if (UnicodeCharacterAliases .CHARACTER_ALIASES .containsKey (normalizedName )) {
302+ return UnicodeCharacterAliases .CHARACTER_ALIASES .get (normalizedName );
303+ } else {
304+ return UCharacter .getCharFromName (characterName );
345305 }
346- return -1 ;
347306 }
348307}
0 commit comments