Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
# Changelog
## v1.6.1 11/26/24
- Add record_num capability to patron_data_helper

## v1.6.0 11/20/24
- Added patron_data_helper functions
- Use executemany instead of execute when appropriate in PostgreSQLClient
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ This package contains common Python utility classes and functions.
* Creating a logger in the appropriate format
* Obfuscating a value using bcrypt
* Parsing/building Research Catalog identifiers
* Mapping between barcodes and Sierra patron ids plus getting patron data from Sierra and Redshift using those ids
* Mapping between barcodes and Sierra patron ids plus getting patron data from Sierra and Redshift using those ids or record_nums

## Usage
```python
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "nypl_py_utils"
version = "1.6.0"
version = "1.6.1"
authors = [
{ name="Aaron Friedman", email="aaronfriedman@nypl.org" },
]
Expand Down
56 changes: 36 additions & 20 deletions src/nypl_py_utils/functions/patron_data_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@
WHERE index_tag || index_entry IN ({});"""

_SIERRA_PATRON_DATA_QUERY = """
SELECT id, barcode, ptype_code, pcode3,
SELECT id, record_num, barcode, ptype_code, pcode3,
CASE WHEN LENGTH(TRIM(home_library_code)) = 0
OR TRIM(home_library_code) = 'none' THEN NULL
ELSE TRIM(home_library_code) END
FROM sierra_view.patron_view
WHERE id IN ({});"""
WHERE {id_field} IN ({ids});"""


def barcodes_to_patron_ids(sierra_client, barcodes, isolate_connection=True,
Expand Down Expand Up @@ -73,61 +73,76 @@ def barcodes_to_patron_ids(sierra_client, barcodes, isolate_connection=True,
return df


def get_sierra_patron_data_from_ids(sierra_client, patron_ids,
def get_sierra_patron_data_from_ids(sierra_client, ids,
isolate_connection=True,
remove_duplicates=False):
remove_duplicates=False,
use_record_num=False):
"""
Given Sierra patron ids, returns standard patron fields from Sierra

Parameters
----------
sierra_client: PostgreSQLClient
The client with which to query Sierra
patron_ids: sequence of strings
The sequence of patron ids to be fetched. Must be iterable and without
'None' entries. Each patron id is expected to be a string.
ids: sequence of strings
The sequence of patron ids or record_nums to be fetched. Must be
iterable and without any 'None' entries. Each id is expected to be a
string.
isolate_connection: bool, optional
Whether the database connection should be opened and closed within this
method or whether it will be handled by the user
remove_duplicates: bool, optional
Whether patron ids that map to multiple rows with different values
should be removed
use_record_num: bool, optional
Whether the `ids` given are record_nums rather than patron ids

Returns
-------
DataFrame
A pandas DataFrame with standard patron columns. The 'patron_id' column
is set to be a string.
"""
unique_patron_ids = set(patron_ids)
if unique_patron_ids:
unique_ids = set(ids)
if unique_ids:
logger.info(
f"Fetching Sierra patron data for ({len(unique_patron_ids)}) "
"patrons")
patron_ids_str = ",".join(unique_patron_ids)
f"Fetching Sierra patron data for ({len(unique_ids)}) patrons")
id_field = "record_num" if use_record_num else "id"
ids_str = ",".join(unique_ids)
if isolate_connection:
sierra_client.connect()
raw_data = sierra_client.execute_query(
_SIERRA_PATRON_DATA_QUERY.format(patron_ids_str))
_SIERRA_PATRON_DATA_QUERY.format(id_field=id_field, ids=ids_str))
if isolate_connection:
sierra_client.close_connection()
else:
logger.info("No patron ids given with which to query Sierra")
raw_data = []

df = pd.DataFrame(raw_data, columns=[
"patron_id", "barcode", "ptype_code", "pcode3",
"patron_id", "record_num", "barcode", "ptype_code", "pcode3",
"patron_home_library_code"])
df = df[pd.notnull(df["patron_id"])]
df["patron_id"] = df["patron_id"].astype("Int64").astype("string")
if remove_duplicates:
if use_record_num:
df = df[pd.notnull(df["record_num"])]
df["record_num"] = df["record_num"].astype("Int32").astype("string")

if not remove_duplicates:
return df.drop_duplicates()
elif use_record_num:
# If one patron id maps to two rows that are identical except for the
# barcode, arbitrarily delete one of the rows
df = df.drop_duplicates(
["patron_id", "record_num", "ptype_code", "pcode3",
"patron_home_library_code"])
return df.drop_duplicates("record_num", keep=False)
else:
# If one patron id maps to two rows that are identical except for the
# barcode or record_num, arbitrarily delete one of the rows
df = df.drop_duplicates(
["patron_id", "ptype_code", "pcode3", "patron_home_library_code"])
return df.drop_duplicates("patron_id", keep=False)
else:
return df.drop_duplicates()


def get_sierra_patron_data_from_barcodes(sierra_client, barcodes,
Expand Down Expand Up @@ -159,12 +174,12 @@ def get_sierra_patron_data_from_barcodes(sierra_client, barcodes,
barcode_patron_id_df = barcodes_to_patron_ids(
sierra_client, barcodes, False, True)
patron_data_df = get_sierra_patron_data_from_ids(
sierra_client, barcode_patron_id_df["patron_id"], False, False)
sierra_client, barcode_patron_id_df["patron_id"], False, False, False)
if isolate_connection:
sierra_client.close_connection()

# If one patron id maps to two rows that are identical except for the
# barcode, arbitrarily delete one of the rows
# barcode or record_num, arbitrarily delete one of the rows
patron_data_df = patron_data_df.drop_duplicates(
["patron_id", "ptype_code", "pcode3", "patron_home_library_code"])

Expand All @@ -181,7 +196,8 @@ def get_sierra_patron_data_from_barcodes(sierra_client, barcodes,
how="left", on="patron_id")
df = pd.concat([perfect_match_df, imperfect_match_df], ignore_index=True)
df.loc[df.duplicated("barcode", keep=False), [
"ptype_code", "pcode3", "patron_home_library_code"]] = None
"record_num", "ptype_code", "pcode3",
"patron_home_library_code"]] = None
return df.drop_duplicates("barcode")


Expand Down
183 changes: 131 additions & 52 deletions tests/test_patron_data_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,36 +19,39 @@
]

_TEST_SIERRA_IDS_RESPONSE = [
(1, "b1", 11, 12, "aa"), (2, "b2", 21, 22, "bb"),
(3, "b3", 31, 32, "cc"), (33, "b3", 331, 332, "ccc"),
(4, None, None, None, None), (5, "b5", 51, 52, "dd"),
(6, "b6", 61, 62, "ee"), (6, "b66", 61, 62, "ee"),
(7, "b7", 71, 72, "ff"), (7, "b77", 771, 772, "ffff"),
(None, "b4", None, None, None), (5, "b5", 51, 52, "dd"),
(1, "1", "b1", 11, 12, "aa"), (2, "2", "b2", 21, 22, "bb"),
(3, "3", "b3", 31, 32, "cc"), (33, "3", "b3", 331, 332, "ccc"),
(4, None, None, None, None, None), (5, "5", "b5", 51, 52, "dd"),
(6, "6", "b6", 61, 62, "ee"), (6, "6", "b66", 61, 62, "ee"),
(7, "7", "b7", 71, 72, "ff"), (7, "77", "b77", 771, 772, "ffff"),
(None, "4", "b4", None, None, None), (5, "5", "b5", 51, 52, "dd"),
]

_TEST_BARCODE_DF = pd.DataFrame(
[[f"b{i}", str(i)] for i in range(1, 10)],
[[f"b{i}", str(i)] for i in range(1, 11)],
columns=["barcode", "patron_id"])
_TEST_BARCODE_DF["patron_id"] = _TEST_BARCODE_DF["patron_id"].astype("string")

_TEST_ID_DF = pd.DataFrame(
[["1", "b1", 11, 12, "aa"], # one perfect match
["2", "b5", 21, 22, "bb"], # different id and barcode matches
[["1", "1", "b1", 11, 12, "aa"], # one perfect match
["2", "2", "b5", 21, 22, "bb"], # different id and barcode matches
# no match for patron id 3
["4", "b4", 41, 42, "dd"], # two matches -- one perfect, one imperfect
["4", "b444", 43, 44, "dddd"],
["5", "b555", 51, 52, "eeee"], # two matches -- both imperfect
["5", "b556", 53, 54, "eeef"],
["6", "b6", 61, 62, "ffff"], # two matches -- both perfect
["6", "b6", 63, 64, "fffg"],
["7", "b777", 71, 72, "gg"], # two matches -- same except barcode
["7", "b778", 71, 72, "gg"],
["8", None, 81, 82, "hh"], # one imperfect match/no barcode
["9", "b9", None, None, None]], # one perfect match/all null fields
columns=["patron_id", "barcode", "ptype_code", "pcode3",
["4", "4", "b4", 41, 42, "dd"], # two matches -- perfect and imperfect
["4", "4", "b444", 43, 44, "dddd"],
["5", "5", "b555", 51, 52, "eeee"], # two matches -- both imperfect
["5", "5", "b556", 53, 54, "eeef"],
["6", "6", "b6", 61, 62, "ffff"], # two matches -- both perfect
["6", "6", "b6", 63, 64, "fffg"],
["7", "7", "b777", 71, 72, "gg"], # two matches -- same but barcode
["7", "7", "b778", 71, 72, "gg"],
["8", "88", "b8", 81, 82, "hh"], # two matches -- same but record_num
["8", "89", "b8", 81, 82, "hh"],
["9", "9", None, 91, 92, "ii"], # one match/no barcode
["10", "10", "b10", None, None, None]], # one match/all null fields
columns=["patron_id", "record_num", "barcode", "ptype_code", "pcode3",
"patron_home_library_code"])
_TEST_ID_DF["patron_id"] = _TEST_ID_DF["patron_id"].astype("string")
_TEST_ID_DF[["patron_id", "record_num"]] = _TEST_ID_DF[
["patron_id", "record_num"]].astype("string")


class TestPatronDataHelper:
Expand Down Expand Up @@ -108,16 +111,20 @@ def test_barcodes_to_patron_ids_with_duplicates(self, mocker):
remove_duplicates=False
))

def test_get_sierra_patron_data_from_ids(self, mocker):
def test_get_sierra_patron_data_from_ids_pat_ids(self, mocker):
RESULT = pd.DataFrame(
[["1", "b1", 11, 12, "aa"], ["2", "b2", 21, 22, "bb"],
["3", "b3", 31, 32, "cc"], ["33", "b3", 331, 332, "ccc"],
["4", None, None, None, None],
["5", "b5", 51, 52, "dd"], ["6", "b6", 61, 62, "ee"],
["6", "b66", 61, 62, "ee"], ["7", "b7", 71, 72, "ff"],
["7", "b77", 771, 772, "ffff"]],
columns=["patron_id", "barcode", "ptype_code", "pcode3",
"patron_home_library_code"])
[["1", "1", "b1", 11, 12, "aa"],
["2", "2", "b2", 21, 22, "bb"],
["3", "3", "b3", 31, 32, "cc"],
["33", "3", "b3", 331, 332, "ccc"],
["4", None, None, None, None, None],
["5", "5", "b5", 51, 52, "dd"],
["6", "6", "b6", 61, 62, "ee"],
["6", "6", "b66", 61, 62, "ee"],
["7", "7", "b7", 71, 72, "ff"],
["7", "77", "b77", 771, 772, "ffff"]],
columns=["patron_id", "record_num", "barcode", "ptype_code",
"pcode3", "patron_home_library_code"])
RESULT["patron_id"] = RESULT["patron_id"].astype("string")

mock_sierra_client = mocker.MagicMock()
Expand All @@ -137,7 +144,48 @@ def test_get_sierra_patron_data_from_ids(self, mocker):
# directly. The workaround is to test the total length of the query
# plus that each id appears in it.
query = mock_sierra_client.execute_query.call_args[0][0]
assert len(query) == 257
assert len(query) == 269
assert "WHERE id IN" in query
for el in range(1, 9):
assert str(el) in query

def test_get_sierra_patron_data_from_ids_record_nums(self, mocker):
RESULT = pd.DataFrame(
[["1", "1", "b1", 11., 12., "aa"],
["2", "2", "b2", 21., 22., "bb"],
["3", "3", "b3", 31., 32., "cc"],
["33", "3", "b3", 331., 332., "ccc"],
["5", "5", "b5", 51., 52., "dd"],
["6", "6", "b6", 61., 62., "ee"],
["6", "6", "b66", 61., 62., "ee"],
["7", "7", "b7", 71., 72., "ff"],
["7", "77", "b77", 771., 772., "ffff"]],
columns=["patron_id", "record_num", "barcode", "ptype_code",
"pcode3", "patron_home_library_code"],
index=[0, 1, 2, 3, 5, 6, 7, 8, 9])
RESULT[["patron_id", "record_num"]] = RESULT[
["patron_id", "record_num"]].astype("string")

mock_sierra_client = mocker.MagicMock()
mock_sierra_client.execute_query.return_value = \
_TEST_SIERRA_IDS_RESPONSE

assert_frame_equal(
RESULT, get_sierra_patron_data_from_ids(
mock_sierra_client, [str(el) for el in range(1, 9)] + ["1",],
use_record_num=True,
))

mock_sierra_client.connect.assert_called_once()
mock_sierra_client.execute_query.assert_called_once()
mock_sierra_client.close_connection.assert_called_once()

# Because the set of record_nums is unordered, it can't be tested
# directly. The workaround is to test the total length of the query
# plus that each id appears in it.
query = mock_sierra_client.execute_query.call_args[0][0]
assert len(query) == 277
assert "WHERE record_num IN" in query
for el in range(1, 9):
assert str(el) in query

Expand All @@ -152,14 +200,18 @@ def test_get_sierra_patron_data_from_ids_unisolated(self, mocker):
mock_sierra_client.execute_query.assert_called_once()
mock_sierra_client.close_connection.assert_not_called()

def test_get_sierra_patron_data_from_ids_without_duplicates(self, mocker):
def test_get_sierra_patron_data_from_ids_without_duplicates_pat_ids(
self, mocker):
RESULT = pd.DataFrame(
[["1", "b1", 11, 12, "aa"], ["2", "b2", 21, 22, "bb"],
["3", "b3", 31, 32, "cc"], ["33", "b3", 331, 332, "ccc"],
["4", None, None, None, None], ["5", "b5", 51, 52, "dd"],
["6", "b6", 61, 62, "ee"]],
columns=["patron_id", "barcode", "ptype_code", "pcode3",
"patron_home_library_code"])
[["1", "1", "b1", 11, 12, "aa"],
["2", "2", "b2", 21, 22, "bb"],
["3", "3", "b3", 31, 32, "cc"],
["33", "3", "b3", 331, 332, "ccc"],
["4", None, None, None, None, None],
["5", "5", "b5", 51, 52, "dd"],
["6", "6", "b6", 61, 62, "ee"]],
columns=["patron_id", "record_num", "barcode", "ptype_code",
"pcode3", "patron_home_library_code"])
RESULT["patron_id"] = RESULT["patron_id"].astype("string")

mock_sierra_client = mocker.MagicMock()
Expand All @@ -172,22 +224,49 @@ def test_get_sierra_patron_data_from_ids_without_duplicates(self, mocker):
remove_duplicates=True
))

def test_get_sierra_patron_data_from_ids_without_duplicates_record_nums(
self, mocker):
RESULT = pd.DataFrame(
[["1", "1", "b1", 11., 12., "aa"],
["2", "2", "b2", 21., 22., "bb"],
["5", "5", "b5", 51., 52., "dd"],
["6", "6", "b6", 61., 62., "ee"],
["7", "7", "b7", 71., 72., "ff"],
["7", "77", "b77", 771., 772., "ffff"]],
columns=["patron_id", "record_num", "barcode", "ptype_code",
"pcode3", "patron_home_library_code"],
index=[0, 1, 5, 6, 8, 9])
RESULT[["patron_id", "record_num"]] = RESULT[
["patron_id", "record_num"]].astype("string")

mock_sierra_client = mocker.MagicMock()
mock_sierra_client.execute_query.return_value = \
_TEST_SIERRA_IDS_RESPONSE

assert_frame_equal(
RESULT, get_sierra_patron_data_from_ids(
mock_sierra_client, [str(el) for el in range(1, 9)],
remove_duplicates=True, use_record_num=True,
))

def test_get_sierra_patron_data_from_barcodes(self, mocker):
RESULT = pd.DataFrame(
[["b1", "1", 11, 12, "aa"],
["b4", "4", 41, 42, "dd"],
["b6", "6", None, None, None],
["b9", "9", None, None, None],
["b2", "2", 21, 22, "bb"],
["b3", "3", None, None, None],
["b5", "5", None, None, None],
["b7", "7", 71, 72, "gg"],
["b8", "8", 81, 82, "hh"]],
columns=["barcode", "patron_id", "ptype_code", "pcode3",
"patron_home_library_code"])
RESULT["patron_id"] = RESULT["patron_id"].astype("string")
TEST_BARCODES = [f"b{i}" for i in range(1, 11)] + ["b1",]
TEST_IDS = pd.Series([str(i) for i in range(1, 10)],
[["b1", "1", "1", 11, 12, "aa"],
["b4", "4", "4", 41, 42, "dd"],
["b6", "6", None, None, None, None],
["b8", "8", "88", 81, 82, "hh"],
["b10", "10", "10", None, None, None],
["b2", "2", "2", 21, 22, "bb"],
["b3", "3", None, None, None, None],
["b5", "5", None, None, None, None],
["b7", "7", "7", 71, 72, "gg"],
["b9", "9", "9", 91, 92, "ii"]],
columns=["barcode", "patron_id", "record_num", "ptype_code",
"pcode3", "patron_home_library_code"])
RESULT[["patron_id", "record_num"]] = RESULT[
["patron_id", "record_num"]].astype("string")
TEST_BARCODES = [f"b{i}" for i in range(1, 12)] + ["b1",]
TEST_IDS = pd.Series([str(i) for i in range(1, 11)],
dtype="string", name="patron_id")
mocked_barcodes_method = mocker.patch(
"nypl_py_utils.functions.patron_data_helper.barcodes_to_patron_ids", # noqa: E501
Expand Down