Skip to content

Commit 445c125

Browse files
gh-62259: Add support of multi-byte encodings in the XML parser
Supported encodings: "cp932", "cp949", "cp950", "Big5", "EUC-JP", "GB2312", "GBK", "johab", and "Shift_JIS". Partially supported encodings (only BMP characters): "Big5-HKSCS", "EUC_JIS-2004", "EUC_JISX0213", "Shift_JIS-2004", "Shift_JISX0213", "utf-8-sig" and non-standard aliases like "UTF8" (without hyphen). The parser now raises ValueError for known unsupported multi-byte encodings such as "ISO-2022-JP" or "raw-unicode-escape" instead of failing later, when encountering non-ASCII data.
1 parent c6f7368 commit 445c125

46 files changed

Lines changed: 374 additions & 21 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Doc/whatsnew/3.16.rst

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,10 +86,20 @@ New modules
8686
Improved modules
8787
================
8888

89-
module_name
90-
-----------
89+
xml
90+
---
9191

92-
* TODO
92+
* Add support for multiple multi-byte encodings in the :mod:`XML parser
93+
<xml.parsers.expat>`: "cp932", "cp949", "cp950", "Big5", "EUC-JP",
94+
"GB2312", "GBK", "johab", and "Shift_JIS".
95+
Add partial support (only BMP characters) for multi-byte encodings
96+
"Big5-HKSCS", "EUC_JIS-2004", "EUC_JISX0213", "Shift_JIS-2004",
97+
"Shift_JISX0213", "utf-8-sig" and non-standard aliases like "UTF8"
98+
(without hyphen).
99+
The parser now raises :exc:`ValueError` for known unsupported
100+
multi-byte encodings such as "ISO-2022-JP" or "raw-unicode-escape"
101+
instead of failing later, when encountering non-ASCII data.
102+
(Contributed by Serhiy Storchaka in :gh:`62259`.)
93103

94104
.. Add improved modules above alphabetically, not here at the end.
95105

Include/internal/pycore_codecs.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ extern int _PyCodec_UnregisterError(const char *name);
4545
in Python 3.5+?
4646
4747
*/
48-
extern PyObject* _PyCodec_LookupTextEncoding(
48+
PyAPI_FUNC(PyObject*) _PyCodec_LookupTextEncoding(
4949
const char *encoding,
5050
const char *alternate_command);
5151

Lib/codecs.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ class CodecInfo(tuple):
9393

9494
def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
9595
incrementalencoder=None, incrementaldecoder=None, name=None,
96-
*, _is_text_encoding=None):
96+
*, _is_text_encoding=None, _expat_decoding_table=None):
9797
self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
9898
self.name = name
9999
self.encode = encode
@@ -104,6 +104,8 @@ def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
104104
self.streamreader = streamreader
105105
if _is_text_encoding is not None:
106106
self._is_text_encoding = _is_text_encoding
107+
if _expat_decoding_table is not None:
108+
self._expat_decoding_table = _expat_decoding_table
107109
return self
108110

109111
def __repr__(self):

Lib/encodings/big5.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,13 @@ def getregentry():
3636
incrementaldecoder=IncrementalDecoder,
3737
streamreader=StreamReader,
3838
streamwriter=StreamWriter,
39+
_expat_decoding_table=(*range(128),
40+
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
41+
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
42+
-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
43+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
44+
-2, -2, -2, -2, -2, -2, -2, -2, -1, -2, -2, -2, -2, -2, -2, -2,
45+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
46+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
47+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1),
3948
)

Lib/encodings/big5hkscs.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,13 @@ def getregentry():
3636
incrementaldecoder=IncrementalDecoder,
3737
streamreader=StreamReader,
3838
streamwriter=StreamWriter,
39+
_expat_decoding_table=(*range(128),
40+
-1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2,
41+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
42+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
43+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
44+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
45+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
46+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
47+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1),
3948
)

Lib/encodings/cp932.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,18 @@ def getregentry():
3636
incrementaldecoder=IncrementalDecoder,
3737
streamreader=StreamReader,
3838
streamwriter=StreamWriter,
39+
_expat_decoding_table=(*range(128),
40+
0x80, -2, -2, -2, -2, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2,
41+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
42+
0xf8f0, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67,
43+
0xff68, 0xff69, 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f,
44+
0xff70, 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77,
45+
0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f,
46+
0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87,
47+
0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, 0xff8f,
48+
0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97,
49+
0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c, 0xff9d, 0xff9e, 0xff9f,
50+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1,
51+
-2, -2, -2, -2, -2, -2, -2, -2,
52+
-2, -2, -1, -1, -1, 0xf8f1, 0xf8f2, 0xf8f3),
3953
)

Lib/encodings/cp949.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,13 @@ def getregentry():
3636
incrementaldecoder=IncrementalDecoder,
3737
streamreader=StreamReader,
3838
streamwriter=StreamWriter,
39+
_expat_decoding_table=(*range(128),
40+
-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
41+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
42+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
43+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
44+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -2, -2, -2, -2, -2, -2,
45+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
46+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
47+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1),
3948
)

Lib/encodings/cp950.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,13 @@ def getregentry():
3636
incrementaldecoder=IncrementalDecoder,
3737
streamreader=StreamReader,
3838
streamwriter=StreamWriter,
39+
_expat_decoding_table=(*range(128),
40+
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
41+
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
42+
-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
43+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
44+
-2, -2, -2, -2, -2, -2, -2, -2, -1, -2, -2, -2, -2, -2, -2, -2,
45+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
46+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
47+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1),
3948
)

Lib/encodings/euc_jis_2004.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,13 @@ def getregentry():
3636
incrementaldecoder=IncrementalDecoder,
3737
streamreader=StreamReader,
3838
streamwriter=StreamWriter,
39+
_expat_decoding_table=(*range(128),
40+
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -3,
41+
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
42+
-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
43+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
44+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
45+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
46+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
47+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1),
3948
)

Lib/encodings/euc_jisx0213.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,13 @@ def getregentry():
3636
incrementaldecoder=IncrementalDecoder,
3737
streamreader=StreamReader,
3838
streamwriter=StreamWriter,
39+
_expat_decoding_table=(*range(128),
40+
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -3,
41+
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
42+
-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
43+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
44+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
45+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
46+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
47+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1),
3948
)

0 commit comments

Comments
 (0)