-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmakeunicode.py
More file actions
93 lines (78 loc) · 2.67 KB
/
Copy pathmakeunicode.py
File metadata and controls
93 lines (78 loc) · 2.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python3
# prettysharp
# Copyright (C) 2019 John Doty
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""A helper script to create the unicode classification functions found in
unicode.c based on the official public unicode data.
"""
import urllib.request
# Location of the Unicode Character Database file this script downloads.
UNICODE_URL = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"

# Unicode general-category codes, grouped by the role they play in the
# identifier classification generated below.
letter_character = {"Lu", "Ll", "Lt", "Lm", "Lo", "Nl"}
combining_character = {"Mn", "Mc"}
connecting_character = {"Pc"}
decimal_digit_character = {"Nd"}
formatting_character = {"Cf"}

# Inclusive (first, last) code point ranges collected while parsing; one list
# for code points that may start an identifier, one for those that may
# continue it.
id_start_ranges = []
id_part_ranges = []

# Categories accepted at the start of an identifier (same set object as
# letter_character, not a copy).
id_start_categories = letter_character

# Categories accepted after the first character of an identifier.
id_part_categories = (
    letter_character | decimal_digit_character | connecting_character
    | combining_character | formatting_character
)
def add_range(id_range, code):
    """Record ``code`` in ``id_range``, merging it into the last range if possible.

    ``id_range`` is a list of inclusive ``(first, last)`` tuples and is
    modified in place; nothing is returned.

    * If ``code`` directly follows the end of the last range, that range is
      extended to include it.
    * If ``code`` already lies inside the last range, the list is left
      unchanged (no redundant overlapping entry is created).
    * Otherwise a new single-element range ``(code, code)`` is appended.

    Only the last range is inspected, so code points must be supplied in
    ascending order for the result to be a minimal list of ranges.
    """
    if id_range:
        first, last = id_range[-1]
        if first <= code <= last:
            # Already covered by the most recent range; nothing to record.
            return
        if code == last + 1:
            # Contiguous with the most recent range: grow it by one.
            id_range[-1] = (first, code)
            return
    id_range.append((code, code))
# Download UnicodeData.txt; the context manager closes the HTTP response even
# if reading or decoding fails.
with urllib.request.urlopen(UNICODE_URL) as response:
    text = response.read().decode("utf-8")

# Each line is a ';'-separated record: field 0 is the code point in hex,
# field 1 the character name, field 2 the general category.
#
# Large blocks (CJK ideographs, Hangul syllables, ...) are not listed one code
# point per line. They appear as a pair of lines whose names end in
# ", First>" and ", Last>", standing for every code point in between. The
# start of such a pair is remembered here until its closing line is seen.
range_first = None
for line in text.splitlines():
    if not line.strip():
        # Tolerate blank lines instead of failing in int() below.
        continue
    parts = line.split(";")
    code = int(parts[0], 16)
    name = parts[1]
    category = parts[2]

    if name.endswith(", First>"):
        # Defer handling until the matching ", Last>" line supplies the end.
        range_first = code
        continue
    if name.endswith(", Last>") and range_first is not None:
        codes = range(range_first, code + 1)
        range_first = None
    else:
        codes = (code,)

    if category in id_start_categories:
        for c in codes:
            add_range(id_start_ranges, c)
    if category in id_part_categories:
        for c in codes:
            add_range(id_part_ranges, c)
def _range_checks(ranges):
    """Return a C boolean expression testing ``rune`` against ``ranges``.

    ``ranges`` is a sequence of inclusive ``(first, last)`` pairs. Each pair
    becomes one ``(rune >= first && rune <= last)`` clause and the clauses are
    joined with ``||``. An empty sequence yields ``false`` so that the
    generated C is still a valid expression.
    """
    if not ranges:
        return "false"
    return " ||\n ".join(
        "(rune >= {} && rune <= {})".format(first, last) for first, last in ranges
    )


# Write the two C classification functions to stdout.
#
# The category names in the generated comments are sorted: they come from
# sets of strings, whose iteration order changes from run to run, and sorting
# keeps the generated file stable when the underlying data has not changed.
print("bool is_identifier_start_rune(uint32_t rune) {")
print(" // Any code point in {}, or '_'".format(", ".join(sorted(id_start_categories))))
print(" return rune == '_' ||\n {};".format(_range_checks(id_start_ranges)))
print("}")
print()
print("bool is_identifier_part_rune(uint32_t rune) {")
print(" // Any code point in {}".format(", ".join(sorted(id_part_categories))))
print(" return {};".format(_range_checks(id_part_ranges)))
print("}")
# for r in id_start_ranges:
# print("[{}, {}]".format(hex(r[0]), hex(r[1])))