v0.3.9, Separate suffixes that are acronyms to handle periods differently

derek73 · derek73 · commit b8e22e095445 · 2015-09-03T01:21:52.000-07:00
diff --git a/README.rst b/README.rst
@@ -63,7 +63,7 @@ the difference between 'title' and 'suffix' is positional, not semantic.
     	nickname: ''
     ]>
 
-Most projects will probably need a bit of adjustments for your dataset. You can
+Your project may need some adjustments for your dataset. You can
 do this in your own pre- or post-processing, by `customizing the configured pre-defined 
 sets`_ of titles, prefixes, etc., or by subclassing the `HumanName` class. See the 
 `full documentation`_ for more information.
@@ -98,13 +98,9 @@ simple Heroku-friendly Flask wrapper for this module.
 Documentation
 -------------
 
-http://nameparser.readthedocs.org/en/latest/
-
-**NOTE:** This documentation covers the new **version 0.3**. For the v0.2.10 documentation,
-see the `v0.2.10 tag`_ on GitHub.
-
-.. _v0.2.10 tag: https://github.com/derek73/python-nameparser/tree/v0.2.10
+Full documentation
 
+http://nameparser.readthedocs.org/en/latest/
 
 
 Contributing
diff --git a/docs/release_log.rst b/docs/release_log.rst
@@ -1,5 +1,8 @@
 Release Log
 ===========
+* 0.3.9 - September 2, 2015
+    - Separate suffixes that are acronyms to handle periods differently, fixes #29, #21
+    - Don't find titles after first name is filled, fixes #27
 * 0.3.8 - September 2, 2015
     - Use regex to check for roman numerals at end of name (#36)
     - Add DVM to suffixes
diff --git a/nameparser/__init__.py b/nameparser/__init__.py
@@ -1,4 +1,4 @@
-VERSION = (0, 3, 8)
+VERSION = (0, 3, 9)
 __version__ = '.'.join(map(str, VERSION))
 __author__ = "Derek Gulbranson"
 __author_email__ = 'derek73@gmail.com'
diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py
@@ -35,7 +35,8 @@
 from nameparser.config.prefixes import PREFIXES
 from nameparser.config.capitalization import CAPITALIZATION_EXCEPTIONS
 from nameparser.config.conjunctions import CONJUNCTIONS
-from nameparser.config.suffixes import SUFFIXES
+from nameparser.config.suffixes import SUFFIXES 
+from nameparser.config.suffixes import SUFFIX_ACRONYMS
 from nameparser.config.titles import TITLES
 from nameparser.config.titles import FIRST_NAME_TITLES
 from nameparser.config.regexes import REGEXES
@@ -141,6 +142,7 @@ class Constants(object):
     def __init__(self, 
                     prefixes=PREFIXES, 
                     suffixes=SUFFIXES,
+                    suffix_acronyms=SUFFIX_ACRONYMS,
                     titles=TITLES,
                     first_name_titles=FIRST_NAME_TITLES,
                     conjunctions=CONJUNCTIONS,
@@ -149,6 +151,7 @@ def __init__(self,
                 ):
         self.prefixes          = SetManager(prefixes)
         self.suffixes          = SetManager(suffixes)
+        self.suffix_acronyms   = SetManager(suffix_acronyms)
         self.titles            = SetManager(titles)
         self.first_name_titles = SetManager(first_name_titles)
         self.conjunctions      = SetManager(conjunctions)
diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py
@@ -9,7 +9,8 @@
     ("initial", re.compile(r'^(\w\.|[A-Z])?$', re.U)),
     ("nickname", re.compile(r'\s*?[\("](.+?)[\)"]', re.U)),
     ("roman_numeral", re.compile(r'^(X|IX|IV|V?I{0,3})$', re.I | re.U)),
-    ("no_vowels",re.compile(r'^[^aeyiuo]+$', re.I | re.U))
+    ("no_vowels",re.compile(r'^[^aeyiuo]+$', re.I | re.U)),
+    ("period_not_at_end",re.compile(r'.*\..+$', re.I | re.U))
 ])
 """
 All regular expressions used by the parser are precompiled and stored in the config.
diff --git a/nameparser/config/suffixes.py b/nameparser/config/suffixes.py
@@ -1,19 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 
-SUFFIXES = set([
-    'esq',
-    'esquire',
-    'jr',
-    'jnr',
-    'sr',
-    'snr',
-    '2',
-    'i',
-    'ii',
-    'iii',
-    'iv',
-    'v',
+SUFFIX_ACRONYMS = set([
     'clu',
     'chfc',
     'cfp',
@@ -33,6 +21,21 @@
     'qc',
     'dvm',
 ])
+
+SUFFIXES = SUFFIX_ACRONYMS | set([
+    'esq',
+    'esquire',
+    'jr',
+    'jnr',
+    'sr',
+    'snr',
+    '2',
+    'i',
+    'ii',
+    'iii',
+    'iv',
+    'v',
+])
 """
 
 Pieces that come at the end of the name but are not last names. These potentially
diff --git a/nameparser/parser.py b/nameparser/parser.py
@@ -281,7 +281,9 @@ def is_roman_numeral(self, value):
     def is_suffix(self, piece):
         """Is in the suffixes set and not :py:func:`is_an_initial()`."""
         # suffixes may have periods inside them like "M.D."
-        return lc(piece).replace('.','') in self.C.suffixes and not self.is_an_initial(piece)
+        return ((lc(piece).replace('.','') in self.C.suffix_acronyms) \
+            or (lc(piece) in self.C.suffixes)) \
+            and not self.is_an_initial(piece)
     
     def are_suffixes(self, pieces):
         """Return True if all pieces are suffixes."""
@@ -304,9 +306,6 @@ def is_an_initial(self, value):
         """
         return bool(self.C.regexes.initial.match(value))
 
-    # def is_a_roman_numeral(value):
-    #     return re_roman_numeral.match(value) or False
-
     
     ### full_name parser
     
@@ -412,7 +411,7 @@ def parse_full_name(self):
                     nxt = None
                 
                 # title must have a next piece, unless it's just a title
-                if self.is_title(piece) and (nxt or p_len == 1):
+                if self.is_title(piece) and (nxt or p_len == 1) and not self.first:
                     self.title_list.append(piece)
                     continue
                 if not self.first:
@@ -446,7 +445,7 @@ def parse_full_name(self):
                     except IndexError:
                         nxt = None
 
-                    if self.is_title(piece) and (nxt or len(pieces) == 1):
+                    if self.is_title(piece) and (nxt or len(pieces) == 1) and not self.first:
                         self.title_list.append(piece)
                         continue
                     if not self.first:
@@ -483,7 +482,7 @@ def parse_full_name(self):
                     except IndexError:
                         nxt = None
                     
-                    if self.is_title(piece) and (nxt or len(pieces) == 1):
+                    if self.is_title(piece) and (nxt or len(pieces) == 1) and not self.first:
                         self.title_list.append(piece)
                         continue
                     if not self.first:
@@ -522,7 +521,9 @@ def parse_full_name(self):
     def parse_pieces(self, parts, additional_parts_count=0):
         """
         Split parts on spaces and remove commas, join on conjunctions and
-        lastname prefixes.
+        lastname prefixes. If a part has a period in the middle, split it on
+        periods and check whether any of the chunks are titles or suffixes. If
+        so, add the whole part to the matching constants set so it is found.
         
         :param list parts: name part strings from the comma split
         :param int additional_parts_count: 
@@ -533,12 +534,31 @@ def parse_pieces(self, parts, additional_parts_count=0):
         :rtype: list
         """
         
-        tmp = []
+        output = []
         for part in parts:
             if not isinstance(part, text_types):
                 raise TypeError("Name parts must be strings. Got {0}".format(type(part)))
-            tmp += [x.strip(' ,') for x in part.split(' ')]
-        return self.join_on_conjunctions(tmp, additional_parts_count)
+            output += [x.strip(' ,') for x in part.split(' ')]
+        
+        # If there are periods, check whether the part is titles or suffixes run
+        # together without spaces, and register it so it gets picked up later.
+        for part in output:
+            # if this part has a period followed by at least one more character
+            if self.C.regexes.period_not_at_end.match(part):
+                # split on periods; are any of the chunks titles or suffixes? ("Lt.Gov.")
+                period_chunks = part.split(".")
+                titles   = filter(self.is_title,  period_chunks)
+                suffixes = filter(self.is_suffix, period_chunks)
+                
+                # add the part to the constant so it will be found
+                if len(list(titles)):
+                    self.C.titles.add(part)
+                    continue
+                if len(list(suffixes)):
+                    self.C.suffixes.add(part)
+                    continue
+        
+        return self.join_on_conjunctions(output, additional_parts_count)
         
     def join_on_conjunctions(self, pieces, additional_parts_count=0):
         """
diff --git a/tests.py b/tests.py
@@ -39,7 +39,7 @@ class HumanNameTestBase(unittest.TestCase):
     def m(self, actual, expected, hn):
         """assertEquals with a better message"""
         try:
-            self.assertEqual(actual, expected, "'%s' != '%s' for '%s'\n%s" % (
+            self.assertEqual(actual, expected, "'%s' != '%s' for '%s'\n%r" % (
                 actual,
                 expected,
                 hn.full_name,
@@ -1464,7 +1464,7 @@ def test_king(self):
         self.m(hn.last, "King", hn)
         self.m(hn.suffix, "Jr", hn)
 
-class HumanNameTitleTestCase(HumanNameTestBase):
+class TitleTestCase(HumanNameTestBase):
 
     def test_last_name_is_also_title(self):
         hn = HumanName("Amy E Maid")
@@ -1572,12 +1572,24 @@ def test_title_with_last_initial_is_suffix(self):
         self.m(hn.title, "King", hn)
         self.m(hn.first, "John", hn)
         self.m(hn.last, "V.", hn)
+        
+    def test_last_name_is_also_title(self):
+        hn = HumanName("Dr. Martin Luther King Jr.")
+        self.m(hn.title, "Dr.", hn)
+        self.m(hn.first, "Martin", hn)
+        self.m(hn.middle, "Luther", hn)
+        self.m(hn.last, "King", hn)
+        self.m(hn.suffix, "Jr.", hn)
+
+    def test_initials_also_suffix(self):
+        hn = HumanName("Smith, J.R.")
+        self.m(hn.first, "J.R.", hn)
+        # self.m(hn.middle, "R.", hn)
+        self.m(hn.last, "Smith", hn)
 
-    @unittest.expectedFailure
     def test_two_title_parts_separated_by_commas(self):
-        # supporting this currently messes up supporting suffixes like M.B.A.
         hn = HumanName("Lt.Gen. John A. Kenneth Doe IV")
-        self.m(hn.title, "Lt. Gen.", hn)
+        self.m(hn.title, "Lt.Gen.", hn)
         self.m(hn.first, "John", hn)
         self.m(hn.last, "Doe", hn)
         self.m(hn.middle, "A. Kenneth", hn)

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-VERSION = (0, 3, 8)`
	`1`	`+VERSION = (0, 3, 9)`
`2`	`2`	`__version__ = '.'.join(map(str, VERSION))`
`3`	`3`	`__author__ = "Derek Gulbranson"`
`4`	`4`	`__author_email__ = 'derek73@gmail.com'`