Skip to content

Commit

Permalink
Merge pull request #40 from NYPL/add_record_num
Browse files Browse the repository at this point in the history
Add search by record_num to patron_data_helper
  • Loading branch information
aaronfriedman6 authored Dec 2, 2024
2 parents f9f5746 + e0f64c0 commit 896f130
Show file tree
Hide file tree
Showing 5 changed files with 172 additions and 74 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
# Changelog
## v1.6.1 11/26/24
- Add record_num capability to patron_data_helper

## v1.6.0 11/20/24
- Added patron_data_helper functions
- Use executemany instead of execute when appropriate in PostgreSQLClient
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ This package contains common Python utility classes and functions.
* Creating a logger in the appropriate format
* Obfuscating a value using bcrypt
* Parsing/building Research Catalog identifiers
* Mapping between barcodes and Sierra patron ids plus getting patron data from Sierra and Redshift using those ids
* Mapping between barcodes and Sierra patron ids plus getting patron data from Sierra and Redshift using those ids or record_nums

## Usage
```python
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "nypl_py_utils"
version = "1.6.0"
version = "1.6.1"
authors = [
{ name="Aaron Friedman", email="[email protected]" },
]
Expand Down
56 changes: 36 additions & 20 deletions src/nypl_py_utils/functions/patron_data_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@
WHERE index_tag || index_entry IN ({});"""

_SIERRA_PATRON_DATA_QUERY = """
SELECT id, barcode, ptype_code, pcode3,
SELECT id, record_num, barcode, ptype_code, pcode3,
CASE WHEN LENGTH(TRIM(home_library_code)) = 0
OR TRIM(home_library_code) = 'none' THEN NULL
ELSE TRIM(home_library_code) END
FROM sierra_view.patron_view
WHERE id IN ({});"""
WHERE {id_field} IN ({ids});"""


def barcodes_to_patron_ids(sierra_client, barcodes, isolate_connection=True,
Expand Down Expand Up @@ -73,61 +73,76 @@ def barcodes_to_patron_ids(sierra_client, barcodes, isolate_connection=True,
return df


def get_sierra_patron_data_from_ids(sierra_client, patron_ids,
def get_sierra_patron_data_from_ids(sierra_client, ids,
isolate_connection=True,
remove_duplicates=False):
remove_duplicates=False,
use_record_num=False):
"""
Given Sierra patron ids, returns standard patron fields from Sierra
Parameters
----------
sierra_client: PostgreSQLClient
The client with which to query Sierra
patron_ids: sequence of strings
The sequence of patron ids to be fetched. Must be iterable and without
'None' entries. Each patron id is expected to be a string.
ids: sequence of strings
The sequence of patron ids or record_nums to be fetched. Must be
iterable and without any 'None' entries. Each id is expected to be a
string.
isolate_connection: bool, optional
Whether the database connection should be opened and closed within this
method or whether it will be handled by the user
remove_duplicates: bool, optional
Whether patron ids that map to multiple rows with different values
should be removed
use_record_num: bool, optional
Whether the `ids` given are record_nums rather than patron ids
Returns
-------
DataFrame
A pandas DataFrame with standard patron columns. The 'patron_id' column
is set to be a string.
"""
unique_patron_ids = set(patron_ids)
if unique_patron_ids:
unique_ids = set(ids)
if unique_ids:
logger.info(
f"Fetching Sierra patron data for ({len(unique_patron_ids)}) "
"patrons")
patron_ids_str = ",".join(unique_patron_ids)
f"Fetching Sierra patron data for ({len(unique_ids)}) patrons")
id_field = "record_num" if use_record_num else "id"
ids_str = ",".join(unique_ids)
if isolate_connection:
sierra_client.connect()
raw_data = sierra_client.execute_query(
_SIERRA_PATRON_DATA_QUERY.format(patron_ids_str))
_SIERRA_PATRON_DATA_QUERY.format(id_field=id_field, ids=ids_str))
if isolate_connection:
sierra_client.close_connection()
else:
logger.info("No patron ids given with which to query Sierra")
raw_data = []

df = pd.DataFrame(raw_data, columns=[
"patron_id", "barcode", "ptype_code", "pcode3",
"patron_id", "record_num", "barcode", "ptype_code", "pcode3",
"patron_home_library_code"])
df = df[pd.notnull(df["patron_id"])]
df["patron_id"] = df["patron_id"].astype("Int64").astype("string")
if remove_duplicates:
if use_record_num:
df = df[pd.notnull(df["record_num"])]
df["record_num"] = df["record_num"].astype("Int32").astype("string")

if not remove_duplicates:
return df.drop_duplicates()
elif use_record_num:
# If one patron id maps to two rows that are identical except for the
# barcode, arbitrarily delete one of the rows
df = df.drop_duplicates(
["patron_id", "record_num", "ptype_code", "pcode3",
"patron_home_library_code"])
return df.drop_duplicates("record_num", keep=False)
else:
# If one patron id maps to two rows that are identical except for the
# barcode or record_num, arbitrarily delete one of the rows
df = df.drop_duplicates(
["patron_id", "ptype_code", "pcode3", "patron_home_library_code"])
return df.drop_duplicates("patron_id", keep=False)
else:
return df.drop_duplicates()


def get_sierra_patron_data_from_barcodes(sierra_client, barcodes,
Expand Down Expand Up @@ -159,12 +174,12 @@ def get_sierra_patron_data_from_barcodes(sierra_client, barcodes,
barcode_patron_id_df = barcodes_to_patron_ids(
sierra_client, barcodes, False, True)
patron_data_df = get_sierra_patron_data_from_ids(
sierra_client, barcode_patron_id_df["patron_id"], False, False)
sierra_client, barcode_patron_id_df["patron_id"], False, False, False)
if isolate_connection:
sierra_client.close_connection()

# If one patron id maps to two rows that are identical except for the
# barcode, arbitrarily delete one of the rows
# barcode or record_num, arbitrarily delete one of the rows
patron_data_df = patron_data_df.drop_duplicates(
["patron_id", "ptype_code", "pcode3", "patron_home_library_code"])

Expand All @@ -181,7 +196,8 @@ def get_sierra_patron_data_from_barcodes(sierra_client, barcodes,
how="left", on="patron_id")
df = pd.concat([perfect_match_df, imperfect_match_df], ignore_index=True)
df.loc[df.duplicated("barcode", keep=False), [
"ptype_code", "pcode3", "patron_home_library_code"]] = None
"record_num", "ptype_code", "pcode3",
"patron_home_library_code"]] = None
return df.drop_duplicates("barcode")


Expand Down
183 changes: 131 additions & 52 deletions tests/test_patron_data_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,36 +19,39 @@
]

_TEST_SIERRA_IDS_RESPONSE = [
(1, "b1", 11, 12, "aa"), (2, "b2", 21, 22, "bb"),
(3, "b3", 31, 32, "cc"), (33, "b3", 331, 332, "ccc"),
(4, None, None, None, None), (5, "b5", 51, 52, "dd"),
(6, "b6", 61, 62, "ee"), (6, "b66", 61, 62, "ee"),
(7, "b7", 71, 72, "ff"), (7, "b77", 771, 772, "ffff"),
(None, "b4", None, None, None), (5, "b5", 51, 52, "dd"),
(1, "1", "b1", 11, 12, "aa"), (2, "2", "b2", 21, 22, "bb"),
(3, "3", "b3", 31, 32, "cc"), (33, "3", "b3", 331, 332, "ccc"),
(4, None, None, None, None, None), (5, "5", "b5", 51, 52, "dd"),
(6, "6", "b6", 61, 62, "ee"), (6, "6", "b66", 61, 62, "ee"),
(7, "7", "b7", 71, 72, "ff"), (7, "77", "b77", 771, 772, "ffff"),
(None, "4", "b4", None, None, None), (5, "5", "b5", 51, 52, "dd"),
]

_TEST_BARCODE_DF = pd.DataFrame(
[[f"b{i}", str(i)] for i in range(1, 10)],
[[f"b{i}", str(i)] for i in range(1, 11)],
columns=["barcode", "patron_id"])
_TEST_BARCODE_DF["patron_id"] = _TEST_BARCODE_DF["patron_id"].astype("string")

_TEST_ID_DF = pd.DataFrame(
[["1", "b1", 11, 12, "aa"], # one perfect match
["2", "b5", 21, 22, "bb"], # different id and barcode matches
[["1", "1", "b1", 11, 12, "aa"], # one perfect match
["2", "2", "b5", 21, 22, "bb"], # different id and barcode matches
# no match for patron id 3
["4", "b4", 41, 42, "dd"], # two matches -- one perfect, one imperfect
["4", "b444", 43, 44, "dddd"],
["5", "b555", 51, 52, "eeee"], # two matches -- both imperfect
["5", "b556", 53, 54, "eeef"],
["6", "b6", 61, 62, "ffff"], # two matches -- both perfect
["6", "b6", 63, 64, "fffg"],
["7", "b777", 71, 72, "gg"], # two matches -- same except barcode
["7", "b778", 71, 72, "gg"],
["8", None, 81, 82, "hh"], # one imperfect match/no barcode
["9", "b9", None, None, None]], # one perfect match/all null fields
columns=["patron_id", "barcode", "ptype_code", "pcode3",
["4", "4", "b4", 41, 42, "dd"], # two matches -- perfect and imperfect
["4", "4", "b444", 43, 44, "dddd"],
["5", "5", "b555", 51, 52, "eeee"], # two matches -- both imperfect
["5", "5", "b556", 53, 54, "eeef"],
["6", "6", "b6", 61, 62, "ffff"], # two matches -- both perfect
["6", "6", "b6", 63, 64, "fffg"],
["7", "7", "b777", 71, 72, "gg"], # two matches -- same but barcode
["7", "7", "b778", 71, 72, "gg"],
["8", "88", "b8", 81, 82, "hh"], # two matches -- same but record_num
["8", "89", "b8", 81, 82, "hh"],
["9", "9", None, 91, 92, "ii"], # one match/no barcode
["10", "10", "b10", None, None, None]], # one match/all null fields
columns=["patron_id", "record_num", "barcode", "ptype_code", "pcode3",
"patron_home_library_code"])
_TEST_ID_DF["patron_id"] = _TEST_ID_DF["patron_id"].astype("string")
_TEST_ID_DF[["patron_id", "record_num"]] = _TEST_ID_DF[
["patron_id", "record_num"]].astype("string")


class TestPatronDataHelper:
Expand Down Expand Up @@ -108,16 +111,20 @@ def test_barcodes_to_patron_ids_with_duplicates(self, mocker):
remove_duplicates=False
))

def test_get_sierra_patron_data_from_ids(self, mocker):
def test_get_sierra_patron_data_from_ids_pat_ids(self, mocker):
RESULT = pd.DataFrame(
[["1", "b1", 11, 12, "aa"], ["2", "b2", 21, 22, "bb"],
["3", "b3", 31, 32, "cc"], ["33", "b3", 331, 332, "ccc"],
["4", None, None, None, None],
["5", "b5", 51, 52, "dd"], ["6", "b6", 61, 62, "ee"],
["6", "b66", 61, 62, "ee"], ["7", "b7", 71, 72, "ff"],
["7", "b77", 771, 772, "ffff"]],
columns=["patron_id", "barcode", "ptype_code", "pcode3",
"patron_home_library_code"])
[["1", "1", "b1", 11, 12, "aa"],
["2", "2", "b2", 21, 22, "bb"],
["3", "3", "b3", 31, 32, "cc"],
["33", "3", "b3", 331, 332, "ccc"],
["4", None, None, None, None, None],
["5", "5", "b5", 51, 52, "dd"],
["6", "6", "b6", 61, 62, "ee"],
["6", "6", "b66", 61, 62, "ee"],
["7", "7", "b7", 71, 72, "ff"],
["7", "77", "b77", 771, 772, "ffff"]],
columns=["patron_id", "record_num", "barcode", "ptype_code",
"pcode3", "patron_home_library_code"])
RESULT["patron_id"] = RESULT["patron_id"].astype("string")

mock_sierra_client = mocker.MagicMock()
Expand All @@ -137,7 +144,48 @@ def test_get_sierra_patron_data_from_ids(self, mocker):
# directly. The workaround is to test the total length of the query
# plus that each id appears in it.
query = mock_sierra_client.execute_query.call_args[0][0]
assert len(query) == 257
assert len(query) == 269
assert "WHERE id IN" in query
for el in range(1, 9):
assert str(el) in query

def test_get_sierra_patron_data_from_ids_record_nums(self, mocker):
RESULT = pd.DataFrame(
[["1", "1", "b1", 11., 12., "aa"],
["2", "2", "b2", 21., 22., "bb"],
["3", "3", "b3", 31., 32., "cc"],
["33", "3", "b3", 331., 332., "ccc"],
["5", "5", "b5", 51., 52., "dd"],
["6", "6", "b6", 61., 62., "ee"],
["6", "6", "b66", 61., 62., "ee"],
["7", "7", "b7", 71., 72., "ff"],
["7", "77", "b77", 771., 772., "ffff"]],
columns=["patron_id", "record_num", "barcode", "ptype_code",
"pcode3", "patron_home_library_code"],
index=[0, 1, 2, 3, 5, 6, 7, 8, 9])
RESULT[["patron_id", "record_num"]] = RESULT[
["patron_id", "record_num"]].astype("string")

mock_sierra_client = mocker.MagicMock()
mock_sierra_client.execute_query.return_value = \
_TEST_SIERRA_IDS_RESPONSE

assert_frame_equal(
RESULT, get_sierra_patron_data_from_ids(
mock_sierra_client, [str(el) for el in range(1, 9)] + ["1",],
use_record_num=True,
))

mock_sierra_client.connect.assert_called_once()
mock_sierra_client.execute_query.assert_called_once()
mock_sierra_client.close_connection.assert_called_once()

# Because the set of record_nums is unordered, it can't be tested
# directly. The workaround is to test the total length of the query
# plus that each id appears in it.
query = mock_sierra_client.execute_query.call_args[0][0]
assert len(query) == 277
assert "WHERE record_num IN" in query
for el in range(1, 9):
assert str(el) in query

Expand All @@ -152,14 +200,18 @@ def test_get_sierra_patron_data_from_ids_unisolated(self, mocker):
mock_sierra_client.execute_query.assert_called_once()
mock_sierra_client.close_connection.assert_not_called()

def test_get_sierra_patron_data_from_ids_without_duplicates(self, mocker):
def test_get_sierra_patron_data_from_ids_without_duplicates_pat_ids(
self, mocker):
RESULT = pd.DataFrame(
[["1", "b1", 11, 12, "aa"], ["2", "b2", 21, 22, "bb"],
["3", "b3", 31, 32, "cc"], ["33", "b3", 331, 332, "ccc"],
["4", None, None, None, None], ["5", "b5", 51, 52, "dd"],
["6", "b6", 61, 62, "ee"]],
columns=["patron_id", "barcode", "ptype_code", "pcode3",
"patron_home_library_code"])
[["1", "1", "b1", 11, 12, "aa"],
["2", "2", "b2", 21, 22, "bb"],
["3", "3", "b3", 31, 32, "cc"],
["33", "3", "b3", 331, 332, "ccc"],
["4", None, None, None, None, None],
["5", "5", "b5", 51, 52, "dd"],
["6", "6", "b6", 61, 62, "ee"]],
columns=["patron_id", "record_num", "barcode", "ptype_code",
"pcode3", "patron_home_library_code"])
RESULT["patron_id"] = RESULT["patron_id"].astype("string")

mock_sierra_client = mocker.MagicMock()
Expand All @@ -172,22 +224,49 @@ def test_get_sierra_patron_data_from_ids_without_duplicates(self, mocker):
remove_duplicates=True
))

def test_get_sierra_patron_data_from_ids_without_duplicates_record_nums(
self, mocker):
RESULT = pd.DataFrame(
[["1", "1", "b1", 11., 12., "aa"],
["2", "2", "b2", 21., 22., "bb"],
["5", "5", "b5", 51., 52., "dd"],
["6", "6", "b6", 61., 62., "ee"],
["7", "7", "b7", 71., 72., "ff"],
["7", "77", "b77", 771., 772., "ffff"]],
columns=["patron_id", "record_num", "barcode", "ptype_code",
"pcode3", "patron_home_library_code"],
index=[0, 1, 5, 6, 8, 9])
RESULT[["patron_id", "record_num"]] = RESULT[
["patron_id", "record_num"]].astype("string")

mock_sierra_client = mocker.MagicMock()
mock_sierra_client.execute_query.return_value = \
_TEST_SIERRA_IDS_RESPONSE

assert_frame_equal(
RESULT, get_sierra_patron_data_from_ids(
mock_sierra_client, [str(el) for el in range(1, 9)],
remove_duplicates=True, use_record_num=True,
))

def test_get_sierra_patron_data_from_barcodes(self, mocker):
RESULT = pd.DataFrame(
[["b1", "1", 11, 12, "aa"],
["b4", "4", 41, 42, "dd"],
["b6", "6", None, None, None],
["b9", "9", None, None, None],
["b2", "2", 21, 22, "bb"],
["b3", "3", None, None, None],
["b5", "5", None, None, None],
["b7", "7", 71, 72, "gg"],
["b8", "8", 81, 82, "hh"]],
columns=["barcode", "patron_id", "ptype_code", "pcode3",
"patron_home_library_code"])
RESULT["patron_id"] = RESULT["patron_id"].astype("string")
TEST_BARCODES = [f"b{i}" for i in range(1, 11)] + ["b1",]
TEST_IDS = pd.Series([str(i) for i in range(1, 10)],
[["b1", "1", "1", 11, 12, "aa"],
["b4", "4", "4", 41, 42, "dd"],
["b6", "6", None, None, None, None],
["b8", "8", "88", 81, 82, "hh"],
["b10", "10", "10", None, None, None],
["b2", "2", "2", 21, 22, "bb"],
["b3", "3", None, None, None, None],
["b5", "5", None, None, None, None],
["b7", "7", "7", 71, 72, "gg"],
["b9", "9", "9", 91, 92, "ii"]],
columns=["barcode", "patron_id", "record_num", "ptype_code",
"pcode3", "patron_home_library_code"])
RESULT[["patron_id", "record_num"]] = RESULT[
["patron_id", "record_num"]].astype("string")
TEST_BARCODES = [f"b{i}" for i in range(1, 12)] + ["b1",]
TEST_IDS = pd.Series([str(i) for i in range(1, 11)],
dtype="string", name="patron_id")
mocked_barcodes_method = mocker.patch(
"nypl_py_utils.functions.patron_data_helper.barcodes_to_patron_ids", # noqa: E501
Expand Down

0 comments on commit 896f130

Please sign in to comment.