# Patch summary (free-text preamble placed before the first "diff --git" header).
#
# Purpose: stop maintaining the CJK Unified Ideograph ranges by hand in two
# places. The ranges stay in makeunicodedata.py:cjk_ranges, and a new
# generator step, makeunicodecjk(), writes unicodedata2/unicodedata_cjk.h
# containing the C function is_unified_ideograph(). unicodedata.c now
# #includes that header instead of defining the function itself.
#
# makeunicodedata.py
#   * cjk_ranges entries change from (start, end) to (start, end, name); the
#     name is emitted as the trailing C comment on each generated range line.
#   * maketables() calls makeunicodecjk(trace) before makeunicodename().
#   * makeunicodecjk(trace) opens the header for writing, prints the
#     "generated by" banner and the function, joining ranges with "||" and
#     terminating the last one with ";". It then prints the range count.
#   * The cjk_check in __init__ now compares only the (start, end) pairs of
#     cjk_ranges against cjk_ranges_found, so the added name element does not
#     cause a "CJK ranges deviate" ValueError.
#
# unicodedata2/unicodedata.c and unicodedata2/unicodedata_cjk.h
#   * The removed hand-written function ended Extension C at 0x2B739 and
#     Extension E at 0x2CEA1, and had no Extension J entry. The generated
#     header uses 0x2B73F and 0x2CEAD and adds 0x323B0..0x33479 (Extension J),
#     matching cjk_ranges.
#
# tests/test_unicodedata2.py
#   * expectedchecksum is updated.
#   * test_cjk_unified_ideograph_names checks name() and lookup() for the
#     first and last code point of every range.
#
# NOTE(review): the line breaks of this patch appear to have been collapsed
#   (hunk headers sit mid-line); restore them before trying to apply it.
# NOTE(review): if cjk_ranges were ever empty, the loop in makeunicodecjk()
#   would emit nothing between "return" and "}", which is not valid C --
#   confirm whether that case needs a guard.
# NOTE(review): start_hex / end_hex hold the result of int(..., 16), i.e.
#   plain ints, not hex strings; the names may mislead.
# NOTE(review): the new test hard-codes its own copy of the ranges, so a
#   future change to cjk_ranges must also be mirrored there by hand.
#
diff --git a/makeunicodedata.py b/makeunicodedata.py index 4d22db8..89ca18e 100644 --- a/makeunicodedata.py +++ b/makeunicodedata.py @@ -99,19 +99,21 @@ CASED_MASK = 0x2000 EXTENDED_CASE_MASK = 0x4000 -# these ranges need to match unicodedata.c:is_unified_ideograph +# CJK Unified Ideograph ranges. +# makeunicodecjk() generates unicodedata_cjk.h from these, which is +# included by unicodedata.c (is_unified_ideograph function). cjk_ranges = [ - ('3400', '4DBF'), # CJK Ideograph Extension A CJK - ('4E00', '9FFF'), # CJK Ideograph - ('20000', '2A6DF'), # CJK Ideograph Extension B - ('2A700', '2B73F'), # CJK Ideograph Extension C - ('2B740', '2B81D'), # CJK Ideograph Extension D - ('2B820', '2CEAD'), # CJK Ideograph Extension E - ('2CEB0', '2EBE0'), # CJK Ideograph Extension F - ('2EBF0', '2EE5D'), # CJK Ideograph Extension I - ('30000', '3134A'), # CJK Ideograph Extension G - ('31350', '323AF'), # CJK Ideograph Extension H - ('323B0', '33479'), # CJK Ideograph Extension J + ('3400', '4DBF', 'CJK Ideograph Extension A'), + ('4E00', '9FFF', 'CJK Ideograph'), + ('20000', '2A6DF', 'CJK Ideograph Extension B'), + ('2A700', '2B73F', 'CJK Ideograph Extension C'), + ('2B740', '2B81D', 'CJK Ideograph Extension D'), + ('2B820', '2CEAD', 'CJK Ideograph Extension E'), + ('2CEB0', '2EBE0', 'CJK Ideograph Extension F'), + ('2EBF0', '2EE5D', 'CJK Ideograph Extension I'), + ('30000', '3134A', 'CJK Ideograph Extension G'), + ('31350', '323AF', 'CJK Ideograph Extension H'), + ('323B0', '33479', 'CJK Ideograph Extension J'), ] @@ -129,11 +131,43 @@ def maketables(trace=0): print(len(list(filter(None, old_unicode.table))), "characters") merge_old_version(version, unicode, old_unicode) + makeunicodecjk(trace) makeunicodename(unicode, trace) makeunicodedata(unicode, trace) makeunicodetype(unicode, trace) +# -------------------------------------------------------------------- +# CJK Unified Ideograph ranges (is_unified_ideograph function) + +def makeunicodecjk(trace): + + FILE = 
"unicodedata2/unicodedata_cjk.h" + + print("--- Preparing", FILE, "...") + + with open(FILE, "w") as fp: + fprint = partial(print, file=fp) + fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION)) + fprint() + fprint("static int") + fprint("is_unified_ideograph(Py_UCS4 code)") + fprint("{") + fprint(" return") + for i, (start, end, name) in enumerate(cjk_ranges): + start_hex = int(start, 16) + end_hex = int(end, 16) + if i < len(cjk_ranges) - 1: + fprint(" (0x%X <= code && code <= 0x%X) || /* %s */" + % (start_hex, end_hex, name)) + else: + fprint(" (0x%X <= code && code <= 0x%X); /* %s */" + % (start_hex, end_hex, name)) + fprint("}") + + print(len(cjk_ranges), "CJK ranges") + + # -------------------------------------------------------------------- # unicode character properties @@ -1056,8 +1090,10 @@ def __init__(self, version, cjk_check=True): field = None elif field: table[i] = from_row(('%X' % i,) + field[1:]) - if cjk_check and cjk_ranges != cjk_ranges_found: - raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found) + if cjk_check: + expected = [(s, e) for s, e, _ in cjk_ranges] + if expected != cjk_ranges_found: + raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found) # public attributes self.filename = UNICODE_DATA % '' diff --git a/tests/test_unicodedata2.py b/tests/test_unicodedata2.py index 8e32e9f..c2382f9 100644 --- a/tests/test_unicodedata2.py +++ b/tests/test_unicodedata2.py @@ -33,7 +33,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest): # Update this if the database changes. Make sure to do a full rebuild # (e.g. 'make distclean && make') to get the correct checksum. 
- expectedchecksum = '2cf81cbeaa7cbc8f1ace57dd6c56d1f30f1a2de1' + expectedchecksum = '65670ae03a324c5f9e826a4de3e25bae4d73c9b7' def test_function_checksum(self): import unicodedata2 @@ -186,6 +186,31 @@ def test_issue29456(self): self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b) + def test_cjk_unified_ideograph_names(self): + # Test that is_unified_ideograph covers all CJK ranges by checking + # that name() and lookup() work for the first and last codepoint of + # each range. These ranges must be kept in sync between + # makeunicodedata.py:cjk_ranges and unicodedata_cjk.h. + cjk_ranges = [ + (0x3400, 0x4DBF), # CJK Ideograph Extension A + (0x4E00, 0x9FFF), # CJK Ideograph + (0x20000, 0x2A6DF), # CJK Ideograph Extension B + (0x2A700, 0x2B73F), # CJK Ideograph Extension C + (0x2B740, 0x2B81D), # CJK Ideograph Extension D + (0x2B820, 0x2CEAD), # CJK Ideograph Extension E + (0x2CEB0, 0x2EBE0), # CJK Ideograph Extension F + (0x2EBF0, 0x2EE5D), # CJK Ideograph Extension I + (0x30000, 0x3134A), # CJK Ideograph Extension G + (0x31350, 0x323AF), # CJK Ideograph Extension H + (0x323B0, 0x33479), # CJK Ideograph Extension J + ] + for start, end in cjk_ranges: + for cp in (start, end): + expected_name = "CJK UNIFIED IDEOGRAPH-%X" % cp + char = chr(cp) + self.assertEqual(self.db.name(char), expected_name) + self.assertEqual(self.db.lookup(expected_name), char) + def test_east_asian_width(self): eaw = self.db.east_asian_width self.assertRaises(TypeError, eaw, b'a') diff --git a/unicodedata2/unicodedata.c b/unicodedata2/unicodedata.c index eced542..ec627e2 100644 --- a/unicodedata2/unicodedata.c +++ b/unicodedata2/unicodedata.c @@ -929,22 +929,8 @@ static const char * const hangul_syllables[][3] = { { 0, 0, "H" } }; -/* These ranges need to match makeunicodedata.py:cjk_ranges. 
*/ -static int -is_unified_ideograph(Py_UCS4 code) -{ - return - (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */ - (0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */ - (0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */ - (0x2A700 <= code && code <= 0x2B739) || /* CJK Ideograph Extension C */ - (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */ - (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */ - (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */ - (0x2EBF0 <= code && code <= 0x2EE5D) || /* CJK Ideograph Extension I */ - (0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */ - (0x31350 <= code && code <= 0x323AF); /* CJK Ideograph Extension H */ -} +/* CJK Unified Ideograph ranges, generated by makeunicodedata.py */ +#include "unicodedata_cjk.h" /* macros used to determine if the given code point is in the PUA range that * we are using to store aliases and named sequences */ diff --git a/unicodedata2/unicodedata_cjk.h b/unicodedata2/unicodedata_cjk.h new file mode 100644 index 0000000..14d29a7 --- /dev/null +++ b/unicodedata2/unicodedata_cjk.h @@ -0,0 +1,18 @@ +/* this file was generated by makeunicodedata.py 3.3 */ + +static int +is_unified_ideograph(Py_UCS4 code) +{ + return + (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */ + (0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */ + (0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */ + (0x2A700 <= code && code <= 0x2B73F) || /* CJK Ideograph Extension C */ + (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */ + (0x2B820 <= code && code <= 0x2CEAD) || /* CJK Ideograph Extension E */ + (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */ + (0x2EBF0 <= code && code <= 0x2EE5D) || /* CJK Ideograph Extension I */ + (0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */ + (0x31350 <= code && 
code <= 0x323AF) || /* CJK Ideograph Extension H */ + (0x323B0 <= code && code <= 0x33479); /* CJK Ideograph Extension J */ +}