Skip to content

Commit b8e22e0

Browse files
committed
v0.3.9, Separate suffixes that are acronyms to handle periods differently
1 parent 415f1bf commit b8e22e0

File tree

8 files changed

+77
-39
lines changed

8 files changed

+77
-39
lines changed

README.rst

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ the difference between 'title' and 'suffix' is positional, not semantic.
6363
nickname: ''
6464
]>
6565

66-
Most projects will probably need a bit of adjustments for your dataset. You can
66+
Your project may need some adjustments for your dataset. You can
6767
do this in your own pre- or post-processing, by `customizing the configured pre-defined
6868
sets`_ of titles, prefixes, etc., or by subclassing the `HumanName` class. See the
6969
`full documentation`_ for more information.
@@ -98,13 +98,9 @@ simple Heroku-friendly Flask wrapper for this module.
9898
Documentation
9999
-------------
100100

101-
http://nameparser.readthedocs.org/en/latest/
102-
103-
**NOTE:** This documentation covers the new **version 0.3**. For the v0.2.10 documentation,
104-
see the `v0.2.10 tag`_ on GitHub.
105-
106-
.. _v0.2.10 tag: https://github.com/derek73/python-nameparser/tree/v0.2.10
101+
Full documentation
107102

103+
http://nameparser.readthedocs.org/en/latest/
108104

109105

110106
Contributing

docs/release_log.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
Release Log
22
===========
3+
* 0.3.9 - September 2, 2015
4+
- Separate suffixes that are acronyms to handle periods differently, fixes #29, #21
5+
- Don't find titles after first name is filled, fixes #27
36
* 0.3.8 - September 2, 2015
47
- Use regex to check for roman numerals at end of name (#36)
58
- Add DVM to suffixes

nameparser/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
VERSION = (0, 3, 8)
1+
VERSION = (0, 3, 9)
22
__version__ = '.'.join(map(str, VERSION))
33
__author__ = "Derek Gulbranson"
44
__author_email__ = 'derek73@gmail.com'

nameparser/config/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@
3535
from nameparser.config.prefixes import PREFIXES
3636
from nameparser.config.capitalization import CAPITALIZATION_EXCEPTIONS
3737
from nameparser.config.conjunctions import CONJUNCTIONS
38-
from nameparser.config.suffixes import SUFFIXES
38+
from nameparser.config.suffixes import SUFFIXES
39+
from nameparser.config.suffixes import SUFFIX_ACRONYMS
3940
from nameparser.config.titles import TITLES
4041
from nameparser.config.titles import FIRST_NAME_TITLES
4142
from nameparser.config.regexes import REGEXES
@@ -141,6 +142,7 @@ class Constants(object):
141142
def __init__(self,
142143
prefixes=PREFIXES,
143144
suffixes=SUFFIXES,
145+
suffix_acronyms=SUFFIX_ACRONYMS,
144146
titles=TITLES,
145147
first_name_titles=FIRST_NAME_TITLES,
146148
conjunctions=CONJUNCTIONS,
@@ -149,6 +151,7 @@ def __init__(self,
149151
):
150152
self.prefixes = SetManager(prefixes)
151153
self.suffixes = SetManager(suffixes)
154+
self.suffix_acronyms = SetManager(suffix_acronyms)
152155
self.titles = SetManager(titles)
153156
self.first_name_titles = SetManager(first_name_titles)
154157
self.conjunctions = SetManager(conjunctions)

nameparser/config/regexes.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
("initial", re.compile(r'^(\w\.|[A-Z])?$', re.U)),
1010
("nickname", re.compile(r'\s*?[\("](.+?)[\)"]', re.U)),
1111
("roman_numeral", re.compile(r'^(X|IX|IV|V?I{0,3})$', re.I | re.U)),
12-
("no_vowels",re.compile(r'^[^aeyiuo]+$', re.I | re.U))
12+
("no_vowels",re.compile(r'^[^aeyiuo]+$', re.I | re.U)),
13+
("period_not_at_end",re.compile(r'.*\..+$', re.I | re.U))
1314
])
1415
"""
1516
All regular expressions used by the parser are precompiled and stored in the config.

nameparser/config/suffixes.py

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,7 @@
11
# -*- coding: utf-8 -*-
22
from __future__ import unicode_literals
33

4-
SUFFIXES = set([
5-
'esq',
6-
'esquire',
7-
'jr',
8-
'jnr',
9-
'sr',
10-
'snr',
11-
'2',
12-
'i',
13-
'ii',
14-
'iii',
15-
'iv',
16-
'v',
4+
SUFFIX_ACRONYMS = set([
175
'clu',
186
'chfc',
197
'cfp',
@@ -33,6 +21,21 @@
3321
'qc',
3422
'dvm',
3523
])
24+
25+
SUFFIXES = SUFFIX_ACRONYMS | set([
26+
'esq',
27+
'esquire',
28+
'jr',
29+
'jnr',
30+
'sr',
31+
'snr',
32+
'2',
33+
'i',
34+
'ii',
35+
'iii',
36+
'iv',
37+
'v',
38+
])
3639
"""
3740
3841
Pieces that come at the end of the name but are not last names. These potentially

nameparser/parser.py

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,9 @@ def is_roman_numeral(self, value):
281281
def is_suffix(self, piece):
282282
"""Is in the suffixes set and not :py:func:`is_an_initial()`."""
283283
# suffixes may have periods inside them like "M.D."
284-
return lc(piece).replace('.','') in self.C.suffixes and not self.is_an_initial(piece)
284+
return ((lc(piece).replace('.','') in self.C.suffix_acronyms) \
285+
or (lc(piece) in self.C.suffixes)) \
286+
and not self.is_an_initial(piece)
285287

286288
def are_suffixes(self, pieces):
287289
"""Return True if all pieces are suffixes."""
@@ -304,9 +306,6 @@ def is_an_initial(self, value):
304306
"""
305307
return bool(self.C.regexes.initial.match(value))
306308

307-
# def is_a_roman_numeral(value):
308-
# return re_roman_numeral.match(value) or False
309-
310309

311310
### full_name parser
312311

@@ -412,7 +411,7 @@ def parse_full_name(self):
412411
nxt = None
413412

414413
# title must have a next piece, unless it's just a title
415-
if self.is_title(piece) and (nxt or p_len == 1):
414+
if self.is_title(piece) and (nxt or p_len == 1) and not self.first:
416415
self.title_list.append(piece)
417416
continue
418417
if not self.first:
@@ -446,7 +445,7 @@ def parse_full_name(self):
446445
except IndexError:
447446
nxt = None
448447

449-
if self.is_title(piece) and (nxt or len(pieces) == 1):
448+
if self.is_title(piece) and (nxt or len(pieces) == 1) and not self.first:
450449
self.title_list.append(piece)
451450
continue
452451
if not self.first:
@@ -483,7 +482,7 @@ def parse_full_name(self):
483482
except IndexError:
484483
nxt = None
485484

486-
if self.is_title(piece) and (nxt or len(pieces) == 1):
485+
if self.is_title(piece) and (nxt or len(pieces) == 1) and not self.first:
487486
self.title_list.append(piece)
488487
continue
489488
if not self.first:
@@ -522,7 +521,9 @@ def parse_full_name(self):
522521
def parse_pieces(self, parts, additional_parts_count=0):
523522
"""
524523
Split parts on spaces and remove commas, join on conjunctions and
525-
lastname prefixes.
524+
lastname prefixes. If parts have periods in the middle, try splitting
525+
on periods and check if the parts are titles or suffixes. If they are,
526+
add the part to the constants so it will be found.
526527
527528
:param list parts: name part strings from the comma split
528529
:param int additional_parts_count:
@@ -533,12 +534,31 @@ def parse_pieces(self, parts, additional_parts_count=0):
533534
:rtype: list
534535
"""
535536

536-
tmp = []
537+
output = []
537538
for part in parts:
538539
if not isinstance(part, text_types):
539540
raise TypeError("Name parts must be strings. Got {0}".format(type(part)))
540-
tmp += [x.strip(' ,') for x in part.split(' ')]
541-
return self.join_on_conjunctions(tmp, additional_parts_count)
541+
output += [x.strip(' ,') for x in part.split(' ')]
542+
543+
# If there are periods, check whether the part is made of titles or suffixes
544+
# joined without spaces, and register it so it gets picked up later.
545+
for part in output:
546+
# if this part has a period followed by at least one more character
547+
if self.C.regexes.period_not_at_end.match(part):
548+
# split on periods, any of the split pieces titles or suffixes? ("Lt.Gov.")
549+
period_chunks = part.split(".")
550+
titles = filter(self.is_title, period_chunks)
551+
suffixes = filter(self.is_suffix, period_chunks)
552+
553+
# add the part to the constant so it will be found
554+
if len(list(titles)):
555+
self.C.titles.add(part)
556+
continue
557+
if len(list(suffixes)):
558+
self.C.suffixes.add(part)
559+
continue
560+
561+
return self.join_on_conjunctions(output, additional_parts_count)
542562

543563
def join_on_conjunctions(self, pieces, additional_parts_count=0):
544564
"""

tests.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ class HumanNameTestBase(unittest.TestCase):
3939
def m(self, actual, expected, hn):
4040
"""assertEquals with a better message"""
4141
try:
42-
self.assertEqual(actual, expected, "'%s' != '%s' for '%s'\n%s" % (
42+
self.assertEqual(actual, expected, "'%s' != '%s' for '%s'\n%r" % (
4343
actual,
4444
expected,
4545
hn.full_name,
@@ -1464,7 +1464,7 @@ def test_king(self):
14641464
self.m(hn.last, "King", hn)
14651465
self.m(hn.suffix, "Jr", hn)
14661466

1467-
class HumanNameTitleTestCase(HumanNameTestBase):
1467+
class TitleTestCase(HumanNameTestBase):
14681468

14691469
def test_last_name_is_also_title(self):
14701470
hn = HumanName("Amy E Maid")
@@ -1572,12 +1572,24 @@ def test_title_with_last_initial_is_suffix(self):
15721572
self.m(hn.title, "King", hn)
15731573
self.m(hn.first, "John", hn)
15741574
self.m(hn.last, "V.", hn)
1575+
1576+
def test_last_name_is_also_title(self):
1577+
hn = HumanName("Dr. Martin Luther King Jr.")
1578+
self.m(hn.title, "Dr.", hn)
1579+
self.m(hn.first, "Martin", hn)
1580+
self.m(hn.middle, "Luther", hn)
1581+
self.m(hn.last, "King", hn)
1582+
self.m(hn.suffix, "Jr.", hn)
1583+
1584+
def test_initials_also_suffix(self):
1585+
hn = HumanName("Smith, J.R.")
1586+
self.m(hn.first, "J.R.", hn)
1587+
# self.m(hn.middle, "R.", hn)
1588+
self.m(hn.last, "Smith", hn)
15751589

1576-
@unittest.expectedFailure
15771590
def test_two_title_parts_separated_by_commas(self):
1578-
# supporting this currently messes up supporting suffixes like M.B.A.
15791591
hn = HumanName("Lt.Gen. John A. Kenneth Doe IV")
1580-
self.m(hn.title, "Lt. Gen.", hn)
1592+
self.m(hn.title, "Lt.Gen.", hn)
15811593
self.m(hn.first, "John", hn)
15821594
self.m(hn.last, "Doe", hn)
15831595
self.m(hn.middle, "A. Kenneth", hn)

0 commit comments

Comments
 (0)