Skip to content

Commit 4386e3a

Browse files
gh-88726: Stop using non-standard charset names eucgb2312_cn and big5_tw in email
1 parent acefff9 commit 4386e3a

3 files changed

Lines changed: 58 additions & 2 deletions

File tree

Lib/email/charset.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -93,8 +93,6 @@
93 93

94 94
# Map charsets to their Unicode codec strings.
95 95
CODEC_MAP = {
96-
'gb2312': 'eucgb2312_cn',
97-
'big5': 'big5_tw',
98 96
# Hack: We don't want *any* conversion for stuff marked us-ascii, as all
99 97
# sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
100 98
# Let that stuff pass through without conversion to/from Unicode.

Lib/test/test_email/test_asian_codecs.py

Lines changed: 56 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,62 @@ def test_japanese_codecs(self):
58 58
# TK: full decode comparison
59 59
eq(str(h).encode(jcode), subject_bytes)
60 60

61+
h = Header("Japanese")
62+
s = '\u65e5\u672c\u8a9e' # 日本語
63+
h.append(s, Charset('euc-jp'))
64+
h.append(s, Charset('iso-2022-jp'))
65+
h.append(s, Charset('shift_jis'))
66+
eq(h.encode(), """\
67+
Japanese =?iso-2022-jp?b?GyRCRnxLXDhsGyhC?= =?iso-2022-jp?b?GyRCRnxLXDhsGyhC?=
68+
=?iso-2022-jp?b?GyRCRnxLXDhsGyhC?=""")
69+
eq(decode_header(h.encode()),
70+
[(b'Japanese ', None),
71+
(b'\x1b$BF|K\\8l\x1b(B\x1b$BF|K\\8l\x1b(B\x1b$BF|K\\8l\x1b(B', 'iso-2022-jp'),
72+
])
73+
74+
def test_chinese_codecs(self):
75+
eq = self.ndiffAssertEqual
76+
h = Header("Chinese")
77+
s = '\u4e2d\u6587' # 中文
78+
h.append(s, Charset('gb2312'))
79+
h.append(s, Charset('gbk'))
80+
h.append(s, Charset('gb18030'))
81+
h.append(s, Charset('hz'))
82+
h.append(s, Charset('big5'))
83+
h.append(s, Charset('big5hkscs'))
84+
eq(h.encode(), """\
85+
Chinese =?gb2312?b?1tDOxA==?= =?gbk?b?1tDOxA==?= =?gb18030?b?1tDOxA==?=
86+
=?hz?b?fntWUE5Efn0=?= =?big5?b?pKSk5Q==?= =?big5hkscs?b?pKSk5Q==?=""")
87+
eq(decode_header(h.encode()),
88+
[(b'Chinese ', None),
89+
(b'\xd6\xd0\xce\xc4', 'gb2312'),
90+
(b'\xd6\xd0\xce\xc4', 'gbk'),
91+
(b'\xd6\xd0\xce\xc4', 'gb18030'),
92+
(b'~{VPND~}', 'hz'),
93+
(b'\xa4\xa4\xa4\xe5', 'big5'),
94+
(b'\xa4\xa4\xa4\xe5', 'big5hkscs'),
95+
])
96+
97+
def test_korean_codecs(self):
98+
eq = self.ndiffAssertEqual
99+
h = Header("Korean")
100+
s = '\ud55c\uad6d\uc5b4' # 한국어
101+
h.append(s, Charset('euc-kr'))
102+
h.append(s, Charset('ks_c_5601-1987'))
103+
h.append(s, Charset('cp949'))
104+
h.append(s, Charset('iso-2022-kr'))
105+
h.append(s, Charset('johab'))
106+
eq(h.encode(), """\
107+
Korean =?euc-kr?b?x9Gxub7u?= =?ks_c_5601-1987?b?x9Gxub7uIMfRsbm+7g==?=
108+
=?iso-2022-kr?b?GyQpQw5HUTE5Pm4P?= =?johab?b?0GWKgrTh?=""")
109+
eq(decode_header(h.encode()),
110+
[(b'Korean ', None),
111+
(b'\xc7\xd1\xb1\xb9\xbe\xee', 'euc-kr'),
112+
(b'\xc7\xd1\xb1\xb9\xbe\xee \xc7\xd1\xb1\xb9\xbe\xee', 'ks_c_5601-1987'),
113+
(b'\x1b$)C\x0eGQ19>n\x0f', 'iso-2022-kr'),
114+
(b'\xd0e\x8a\x82\xb4\xe1', 'johab'),
115+
])
116+
61 117
def test_payload_encoding_utf8(self):
62 118
jhello = str(b'\xa5\xcf\xa5\xed\xa1\xbc\xa5\xef\xa1\xbc'
63 119
b'\xa5\xeb\xa5\xc9\xa1\xaa', 'euc-jp')
Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
The :mod:`email` package now uses standard MIME charset names "gb2312" and
2+
"big5" instead of non-standard names "eucgb2312_cn" and "big5_tw".

0 commit comments

Comments (0)