NYPL · aaronfriedman6 · Dec 2, 2024 · Nov 26, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,7 @@
 # Changelog
+## v1.6.1 11/26/24
+- Add record_num capability to patron_data_helper
+
 ## v1.6.0 11/20/24
 - Added patron_data_helper functions
 - Use executemany instead of execute when appropriate in PostgreSQLClient

diff --git a/README.md b/README.md
@@ -20,7 +20,7 @@ This package contains common Python utility classes and functions.
 * Creating a logger in the appropriate format
 * Obfuscating a value using bcrypt
 * Parsing/building Research Catalog identifiers
-* Mapping between barcodes and Sierra patron ids plus getting patron data from Sierra and Redshift using those ids
+* Mapping between barcodes and Sierra patron ids plus getting patron data from Sierra and Redshift using those ids or record_nums
 
 ## Usage
 ```python

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "nypl_py_utils"
-version = "1.6.0"
+version = "1.6.1"
 authors = [
   { name="Aaron Friedman", email="aaronfriedman@nypl.org" },
 ]

diff --git a/src/nypl_py_utils/functions/patron_data_helper.py b/src/nypl_py_utils/functions/patron_data_helper.py
@@ -15,12 +15,12 @@
     WHERE index_tag || index_entry IN ({});"""
 
 _SIERRA_PATRON_DATA_QUERY = """
-    SELECT id, barcode, ptype_code, pcode3,
+    SELECT id, record_num, barcode, ptype_code, pcode3,
     CASE WHEN LENGTH(TRIM(home_library_code)) = 0
         OR TRIM(home_library_code) = 'none' THEN NULL
         ELSE TRIM(home_library_code) END
     FROM sierra_view.patron_view
-    WHERE id IN ({});"""
+    WHERE {id_field} IN ({ids});"""
 
 
 def barcodes_to_patron_ids(sierra_client, barcodes, isolate_connection=True,
@@ -73,61 +73,76 @@ def barcodes_to_patron_ids(sierra_client, barcodes, isolate_connection=True,
         return df
 
 
-def get_sierra_patron_data_from_ids(sierra_client, patron_ids,
+def get_sierra_patron_data_from_ids(sierra_client, ids,
                                     isolate_connection=True,
-                                    remove_duplicates=False):
+                                    remove_duplicates=False,
+                                    use_record_num=False):
     """
     Given Sierra patron ids, returns standard patron fields from Sierra
 
     Parameters
     ----------
     sierra_client: PostgreSQLClient
         The client with which to query Sierra
-    patron_ids: sequence of strings
-        The sequence of patron ids to be fetched. Must be iterable and without
-        'None' entries. Each patron id is expected to be a string.
+    ids: sequence of strings
+        The sequence of patron ids or record_nums to be fetched. Must be
+        iterable and without any 'None' entries. Each id is expected to be a
+        string.
     isolate_connection: bool, optional
         Whether the database connection should be opened and closed within this
         method or whether it will be handled by the user
     remove_duplicates: bool, optional
         Whether patron ids that map to multiple rows with different values
         should be removed
+    use_record_num: bool, optional
+        Whether the `ids` given are record_nums rather than patron ids
 
     Returns
     -------
     DataFrame
         A pandas DataFrame with standard patron columns. The 'patron_id' column
         is set to be a string.
     """
-    unique_patron_ids = set(patron_ids)
-    if unique_patron_ids:
+    unique_ids = set(ids)
+    if unique_ids:
         logger.info(
-            f"Fetching Sierra patron data for ({len(unique_patron_ids)}) "
-            "patrons")
-        patron_ids_str = ",".join(unique_patron_ids)
+            f"Fetching Sierra patron data for ({len(unique_ids)}) patrons")
+        id_field = "record_num" if use_record_num else "id"
+        ids_str = ",".join(unique_ids)
         if isolate_connection:
             sierra_client.connect()
         raw_data = sierra_client.execute_query(
-            _SIERRA_PATRON_DATA_QUERY.format(patron_ids_str))
+            _SIERRA_PATRON_DATA_QUERY.format(id_field=id_field, ids=ids_str))
         if isolate_connection:
             sierra_client.close_connection()
     else:
         logger.info("No patron ids given with which to query Sierra")
         raw_data = []
 
     df = pd.DataFrame(raw_data, columns=[
-        "patron_id", "barcode", "ptype_code", "pcode3",
+        "patron_id", "record_num", "barcode", "ptype_code", "pcode3",
         "patron_home_library_code"])
     df = df[pd.notnull(df["patron_id"])]
     df["patron_id"] = df["patron_id"].astype("Int64").astype("string")
-    if remove_duplicates:
+    if use_record_num:
+        df = df[pd.notnull(df["record_num"])]
+        df["record_num"] = df["record_num"].astype("Int32").astype("string")
+
+    if not remove_duplicates:
+        return df.drop_duplicates()
+    elif use_record_num:
         # If one patron id maps to two rows that are identical except for the
         # barcode, arbitrarily delete one of the rows
+        df = df.drop_duplicates(
+            ["patron_id", "record_num", "ptype_code", "pcode3",
+             "patron_home_library_code"])
+        return df.drop_duplicates("record_num", keep=False)
+    else:
+        # If one patron id maps to two rows that are identical except for the
+        # barcode or record_num, arbitrarily delete one of the rows
         df = df.drop_duplicates(
             ["patron_id", "ptype_code", "pcode3", "patron_home_library_code"])
         return df.drop_duplicates("patron_id", keep=False)
-    else:
-        return df.drop_duplicates()
 
 
 def get_sierra_patron_data_from_barcodes(sierra_client, barcodes,
@@ -159,12 +174,12 @@ def get_sierra_patron_data_from_barcodes(sierra_client, barcodes,
     barcode_patron_id_df = barcodes_to_patron_ids(
         sierra_client, barcodes, False, True)
     patron_data_df = get_sierra_patron_data_from_ids(
-        sierra_client, barcode_patron_id_df["patron_id"], False, False)
+        sierra_client, barcode_patron_id_df["patron_id"], False, False, False)
     if isolate_connection:
         sierra_client.close_connection()
 
     # If one patron id maps to two rows that are identical except for the
-    # barcode, arbitrarily delete one of the rows
+    # barcode or record_num, arbitrarily delete one of the rows
     patron_data_df = patron_data_df.drop_duplicates(
         ["patron_id", "ptype_code", "pcode3", "patron_home_library_code"])
 
@@ -181,7 +196,8 @@ def get_sierra_patron_data_from_barcodes(sierra_client, barcodes,
                                       how="left", on="patron_id")
     df = pd.concat([perfect_match_df, imperfect_match_df], ignore_index=True)
     df.loc[df.duplicated("barcode", keep=False), [
-        "ptype_code", "pcode3", "patron_home_library_code"]] = None
+        "record_num", "ptype_code", "pcode3",
+        "patron_home_library_code"]] = None
     return df.drop_duplicates("barcode")
 
 

diff --git a/tests/test_patron_data_helper.py b/tests/test_patron_data_helper.py
@@ -19,36 +19,39 @@
 ]
 
 _TEST_SIERRA_IDS_RESPONSE = [
-    (1, "b1", 11, 12, "aa"), (2, "b2", 21, 22, "bb"),
-    (3, "b3", 31, 32, "cc"), (33, "b3", 331, 332, "ccc"),
-    (4, None, None, None, None), (5, "b5", 51, 52, "dd"),
-    (6, "b6", 61, 62, "ee"), (6, "b66", 61, 62, "ee"),
-    (7, "b7", 71, 72, "ff"), (7, "b77", 771, 772, "ffff"),
-    (None, "b4", None, None, None), (5, "b5", 51, 52, "dd"),
+    (1, "1", "b1", 11, 12, "aa"), (2, "2", "b2", 21, 22, "bb"),
+    (3, "3", "b3", 31, 32, "cc"), (33, "3", "b3", 331, 332, "ccc"),
+    (4, None, None, None, None, None), (5, "5", "b5", 51, 52, "dd"),
+    (6, "6", "b6", 61, 62, "ee"), (6, "6", "b66", 61, 62, "ee"),
+    (7, "7", "b7", 71, 72, "ff"), (7, "77", "b77", 771, 772, "ffff"),
+    (None, "4", "b4", None, None, None), (5, "5", "b5", 51, 52, "dd"),
 ]
 
 _TEST_BARCODE_DF = pd.DataFrame(
-    [[f"b{i}", str(i)] for i in range(1, 10)],
+    [[f"b{i}", str(i)] for i in range(1, 11)],
     columns=["barcode", "patron_id"])
 _TEST_BARCODE_DF["patron_id"] = _TEST_BARCODE_DF["patron_id"].astype("string")
 
 _TEST_ID_DF = pd.DataFrame(
-    [["1", "b1", 11, 12, "aa"],  # one perfect match
-     ["2", "b5", 21, 22, "bb"],  # different id and barcode matches
+    [["1", "1", "b1", 11, 12, "aa"],  # one perfect match
+     ["2", "2", "b5", 21, 22, "bb"],  # different id and barcode matches
      # no match for patron id 3
-     ["4", "b4", 41, 42, "dd"],  # two matches -- one perfect, one imperfect
-     ["4", "b444", 43, 44, "dddd"],
-     ["5", "b555", 51, 52, "eeee"],  # two matches -- both imperfect
-     ["5", "b556", 53, 54, "eeef"],
-     ["6", "b6", 61, 62, "ffff"],  # two matches -- both perfect
-     ["6", "b6", 63, 64, "fffg"],
-     ["7", "b777", 71, 72, "gg"],  # two matches -- same except barcode
-     ["7", "b778", 71, 72, "gg"],
-     ["8", None, 81, 82, "hh"],  # one imperfect match/no barcode
-     ["9", "b9", None, None, None]],  # one perfect match/all null fields
-    columns=["patron_id", "barcode", "ptype_code", "pcode3",
+     ["4", "4", "b4", 41, 42, "dd"],  # two matches -- perfect and imperfect
+     ["4", "4", "b444", 43, 44, "dddd"],
+     ["5", "5", "b555", 51, 52, "eeee"],  # two matches -- both imperfect
+     ["5", "5", "b556", 53, 54, "eeef"],
+     ["6", "6", "b6", 61, 62, "ffff"],  # two matches -- both perfect
+     ["6", "6", "b6", 63, 64, "fffg"],
+     ["7", "7", "b777", 71, 72, "gg"],  # two matches -- same but barcode
+     ["7", "7", "b778", 71, 72, "gg"],
+     ["8", "88", "b8", 81, 82, "hh"],  # two matches -- same but record_num
+     ["8", "89", "b8", 81, 82, "hh"],
+     ["9", "9", None, 91, 92, "ii"],  # one match/no barcode
+     ["10", "10", "b10", None, None, None]],  # one match/all null fields
+    columns=["patron_id", "record_num", "barcode", "ptype_code", "pcode3",
              "patron_home_library_code"])
-_TEST_ID_DF["patron_id"] = _TEST_ID_DF["patron_id"].astype("string")
+_TEST_ID_DF[["patron_id", "record_num"]] = _TEST_ID_DF[
+    ["patron_id", "record_num"]].astype("string")
 
 
 class TestPatronDataHelper:
@@ -108,16 +111,20 @@ def test_barcodes_to_patron_ids_with_duplicates(self, mocker):
                 remove_duplicates=False
             ))
 
-    def test_get_sierra_patron_data_from_ids(self, mocker):
+    def test_get_sierra_patron_data_from_ids_pat_ids(self, mocker):
         RESULT = pd.DataFrame(
-            [["1", "b1", 11, 12, "aa"], ["2", "b2", 21, 22, "bb"],
-             ["3", "b3", 31, 32, "cc"], ["33", "b3", 331, 332, "ccc"],
-             ["4", None, None, None, None],
-             ["5", "b5", 51, 52, "dd"], ["6", "b6", 61, 62, "ee"],
-             ["6", "b66", 61, 62, "ee"], ["7", "b7", 71, 72, "ff"],
-             ["7", "b77", 771, 772, "ffff"]],
-            columns=["patron_id", "barcode", "ptype_code", "pcode3",
-                     "patron_home_library_code"])
+            [["1", "1", "b1", 11, 12, "aa"],
+             ["2", "2", "b2", 21, 22, "bb"],
+             ["3", "3", "b3", 31, 32, "cc"],
+             ["33", "3", "b3", 331, 332, "ccc"],
+             ["4", None, None, None, None, None],
+             ["5", "5", "b5", 51, 52, "dd"],
+             ["6", "6", "b6", 61, 62, "ee"],
+             ["6", "6", "b66", 61, 62, "ee"],
+             ["7", "7", "b7", 71, 72, "ff"],
+             ["7", "77", "b77", 771, 772, "ffff"]],
+            columns=["patron_id", "record_num", "barcode", "ptype_code",
+                     "pcode3", "patron_home_library_code"])
         RESULT["patron_id"] = RESULT["patron_id"].astype("string")
 
         mock_sierra_client = mocker.MagicMock()
@@ -137,7 +144,48 @@ def test_get_sierra_patron_data_from_ids(self, mocker):
         # directly. The workaround is to test the total length of the query
         # plus that each id appears in it.
         query = mock_sierra_client.execute_query.call_args[0][0]
-        assert len(query) == 257
+        assert len(query) == 269
+        assert "WHERE id IN" in query
+        for el in range(1, 9):
+            assert str(el) in query
+
+    def test_get_sierra_patron_data_from_ids_record_nums(self, mocker):
+        RESULT = pd.DataFrame(
+            [["1", "1", "b1", 11., 12., "aa"],
+             ["2", "2", "b2", 21., 22., "bb"],
+             ["3", "3", "b3", 31., 32., "cc"],
+             ["33", "3", "b3", 331., 332., "ccc"],
+             ["5", "5", "b5", 51., 52., "dd"],
+             ["6", "6", "b6", 61., 62., "ee"],
+             ["6", "6", "b66", 61., 62., "ee"],
+             ["7", "7", "b7", 71., 72., "ff"],
+             ["7", "77", "b77", 771., 772., "ffff"]],
+            columns=["patron_id", "record_num", "barcode", "ptype_code",
+                     "pcode3", "patron_home_library_code"],
+            index=[0, 1, 2, 3, 5, 6, 7, 8, 9])
+        RESULT[["patron_id", "record_num"]] = RESULT[
+            ["patron_id", "record_num"]].astype("string")
+
+        mock_sierra_client = mocker.MagicMock()
+        mock_sierra_client.execute_query.return_value = \
+            _TEST_SIERRA_IDS_RESPONSE
+
+        assert_frame_equal(
+            RESULT, get_sierra_patron_data_from_ids(
+                mock_sierra_client, [str(el) for el in range(1, 9)] + ["1",],
+                use_record_num=True,
+            ))
+
+        mock_sierra_client.connect.assert_called_once()
+        mock_sierra_client.execute_query.assert_called_once()
+        mock_sierra_client.close_connection.assert_called_once()
+
+        # Because the set of record_nums is unordered, it can't be tested
+        # directly. The workaround is to test the total length of the query
+        # plus that each id appears in it.
+        query = mock_sierra_client.execute_query.call_args[0][0]
+        assert len(query) == 277
+        assert "WHERE record_num IN" in query
         for el in range(1, 9):
             assert str(el) in query
 
@@ -152,14 +200,18 @@ def test_get_sierra_patron_data_from_ids_unisolated(self, mocker):
         mock_sierra_client.execute_query.assert_called_once()
         mock_sierra_client.close_connection.assert_not_called()
 
-    def test_get_sierra_patron_data_from_ids_without_duplicates(self, mocker):
+    def test_get_sierra_patron_data_from_ids_without_duplicates_pat_ids(
+            self, mocker):
         RESULT = pd.DataFrame(
-            [["1", "b1", 11, 12, "aa"], ["2", "b2", 21, 22, "bb"],
-             ["3", "b3", 31, 32, "cc"], ["33", "b3", 331, 332, "ccc"],
-             ["4", None, None, None, None], ["5", "b5", 51, 52, "dd"],
-             ["6", "b6", 61, 62, "ee"]],
-            columns=["patron_id", "barcode", "ptype_code", "pcode3",
-                     "patron_home_library_code"])
+            [["1", "1", "b1", 11, 12, "aa"],
+             ["2", "2", "b2", 21, 22, "bb"],
+             ["3", "3", "b3", 31, 32, "cc"],
+             ["33", "3", "b3", 331, 332, "ccc"],
+             ["4", None, None, None, None, None],
+             ["5", "5", "b5", 51, 52, "dd"],
+             ["6", "6", "b6", 61, 62, "ee"]],
+            columns=["patron_id", "record_num", "barcode", "ptype_code",
+                     "pcode3", "patron_home_library_code"])
         RESULT["patron_id"] = RESULT["patron_id"].astype("string")
 
         mock_sierra_client = mocker.MagicMock()
@@ -172,22 +224,49 @@ def test_get_sierra_patron_data_from_ids_without_duplicates(self, mocker):
                 remove_duplicates=True
             ))
 
+    def test_get_sierra_patron_data_from_ids_without_duplicates_record_nums(
+            self, mocker):
+        RESULT = pd.DataFrame(
+            [["1", "1", "b1", 11., 12., "aa"],
+             ["2", "2", "b2", 21., 22., "bb"],
+             ["5", "5", "b5", 51., 52., "dd"],
+             ["6", "6", "b6", 61., 62., "ee"],
+             ["7", "7", "b7", 71., 72., "ff"],
+             ["7", "77", "b77", 771., 772., "ffff"]],
+            columns=["patron_id", "record_num", "barcode", "ptype_code",
+                     "pcode3", "patron_home_library_code"],
+            index=[0, 1, 5, 6, 8, 9])
+        RESULT[["patron_id", "record_num"]] = RESULT[
+            ["patron_id", "record_num"]].astype("string")
+
+        mock_sierra_client = mocker.MagicMock()
+        mock_sierra_client.execute_query.return_value = \
+            _TEST_SIERRA_IDS_RESPONSE
+
+        assert_frame_equal(
+            RESULT, get_sierra_patron_data_from_ids(
+                mock_sierra_client, [str(el) for el in range(1, 9)],
+                remove_duplicates=True, use_record_num=True,
+            ))
+
     def test_get_sierra_patron_data_from_barcodes(self, mocker):
         RESULT = pd.DataFrame(
-            [["b1", "1", 11, 12, "aa"],
-             ["b4", "4", 41, 42, "dd"],
-             ["b6", "6", None, None, None],
-             ["b9", "9", None, None, None],
-             ["b2", "2", 21, 22, "bb"],
-             ["b3", "3", None, None, None],
-             ["b5", "5", None, None, None],
-             ["b7", "7", 71, 72, "gg"],
-             ["b8", "8", 81, 82, "hh"]],
-            columns=["barcode", "patron_id", "ptype_code", "pcode3",
-                     "patron_home_library_code"])
-        RESULT["patron_id"] = RESULT["patron_id"].astype("string")
-        TEST_BARCODES = [f"b{i}" for i in range(1, 11)] + ["b1",]
-        TEST_IDS = pd.Series([str(i) for i in range(1, 10)],
+            [["b1", "1", "1", 11, 12, "aa"],
+             ["b4", "4", "4", 41, 42, "dd"],
+             ["b6", "6", None, None, None, None],
+             ["b8", "8", "88", 81, 82, "hh"],
+             ["b10", "10", "10", None, None, None],
+             ["b2", "2", "2", 21, 22, "bb"],
+             ["b3", "3", None, None, None, None],
+             ["b5", "5", None, None, None, None],
+             ["b7", "7", "7", 71, 72, "gg"],
+             ["b9", "9", "9", 91, 92, "ii"]],
+            columns=["barcode", "patron_id", "record_num", "ptype_code",
+                     "pcode3", "patron_home_library_code"])
+        RESULT[["patron_id", "record_num"]] = RESULT[
+            ["patron_id", "record_num"]].astype("string")
+        TEST_BARCODES = [f"b{i}" for i in range(1, 12)] + ["b1",]
+        TEST_IDS = pd.Series([str(i) for i in range(1, 11)],
                              dtype="string", name="patron_id")
         mocked_barcodes_method = mocker.patch(
             "nypl_py_utils.functions.patron_data_helper.barcodes_to_patron_ids",  # noqa: E501