From e0f64c0ce5bc54e231480cb8beb873123cef571d Mon Sep 17 00:00:00 2001 From: aaronfriedman Date: Tue, 26 Nov 2024 12:28:10 -0700 Subject: [PATCH] Add search by record_num to patron_data_helper --- CHANGELOG.md | 3 + README.md | 2 +- pyproject.toml | 2 +- .../functions/patron_data_helper.py | 56 ++++-- tests/test_patron_data_helper.py | 183 +++++++++++++----- 5 files changed, 172 insertions(+), 74 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a772e4..62a5731 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,7 @@ # Changelog +## v1.6.1 11/26/24 +- Add record_num capability to patron_data_helper + ## v1.6.0 11/20/24 - Added patron_data_helper functions - Use executemany instead of execute when appropriate in PostgreSQLClient diff --git a/README.md b/README.md index ff9ba81..9ea85a0 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ This package contains common Python utility classes and functions. * Creating a logger in the appropriate format * Obfuscating a value using bcrypt * Parsing/building Research Catalog identifiers -* Mapping between barcodes and Sierra patron ids plus getting patron data from Sierra and Redshift using those ids +* Mapping between barcodes and Sierra patron ids plus getting patron data from Sierra and Redshift using those ids or record_nums ## Usage ```python diff --git a/pyproject.toml b/pyproject.toml index a410fc0..8f4a459 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "nypl_py_utils" -version = "1.6.0" +version = "1.6.1" authors = [ { name="Aaron Friedman", email="aaronfriedman@nypl.org" }, ] diff --git a/src/nypl_py_utils/functions/patron_data_helper.py b/src/nypl_py_utils/functions/patron_data_helper.py index b97b2a1..d3a10d9 100644 --- a/src/nypl_py_utils/functions/patron_data_helper.py +++ b/src/nypl_py_utils/functions/patron_data_helper.py @@ -15,12 +15,12 @@ WHERE index_tag || index_entry IN ({});""" _SIERRA_PATRON_DATA_QUERY = """ - SELECT id, barcode, ptype_code, pcode3, + SELECT id, record_num, barcode, ptype_code, pcode3, CASE WHEN LENGTH(TRIM(home_library_code)) = 0 OR TRIM(home_library_code) = 'none' THEN NULL ELSE TRIM(home_library_code) END FROM sierra_view.patron_view - WHERE id IN ({});""" + WHERE {id_field} IN ({ids});""" def barcodes_to_patron_ids(sierra_client, barcodes, isolate_connection=True, @@ -73,9 +73,10 @@ def barcodes_to_patron_ids(sierra_client, barcodes, isolate_connection=True, return df -def get_sierra_patron_data_from_ids(sierra_client, patron_ids, +def get_sierra_patron_data_from_ids(sierra_client, ids, isolate_connection=True, - remove_duplicates=False): + remove_duplicates=False, + use_record_num=False): """ Given Sierra patron ids, returns standard patron fields from Sierra @@ -83,15 +84,18 @@ def get_sierra_patron_data_from_ids(sierra_client, patron_ids, ---------- sierra_client: PostgreSQLClient The client with which to query Sierra - patron_ids: sequence of strings - The sequence of patron ids to be fetched. Must be iterable and without - 'None' entries. Each patron id is expected to be a string. + ids: sequence of strings + The sequence of patron ids or record_nums to be fetched. Must be + iterable and without any 'None' entries. Each id is expected to be a + string. isolate_connection: bool, optional Whether the database connection should be opened and closed within this method or whether it will be handled by the user remove_duplicates: bool, optional Whether patron ids that map to multiple rows with different values should be removed + use_record_num: bool, optional + Whether the `ids` given are record_nums rather than patron ids Returns ------- @@ -99,16 +103,16 @@ def get_sierra_patron_data_from_ids(sierra_client, patron_ids, A pandas DataFrame with standard patron columns. The 'patron_id' column is set to be a string. """ - unique_patron_ids = set(patron_ids) - if unique_patron_ids: + unique_ids = set(ids) + if unique_ids: logger.info( - f"Fetching Sierra patron data for ({len(unique_patron_ids)}) " - "patrons") - patron_ids_str = ",".join(unique_patron_ids) + f"Fetching Sierra patron data for ({len(unique_ids)}) patrons") + id_field = "record_num" if use_record_num else "id" + ids_str = ",".join(unique_ids) if isolate_connection: sierra_client.connect() raw_data = sierra_client.execute_query( - _SIERRA_PATRON_DATA_QUERY.format(patron_ids_str)) + _SIERRA_PATRON_DATA_QUERY.format(id_field=id_field, ids=ids_str)) if isolate_connection: sierra_client.close_connection() else: @@ -116,18 +120,29 @@ def get_sierra_patron_data_from_ids(sierra_client, patron_ids, raw_data = [] df = pd.DataFrame(raw_data, columns=[ - "patron_id", "barcode", "ptype_code", "pcode3", + "patron_id", "record_num", "barcode", "ptype_code", "pcode3", "patron_home_library_code"]) df = df[pd.notnull(df["patron_id"])] df["patron_id"] = df["patron_id"].astype("Int64").astype("string") - if remove_duplicates: + if use_record_num: + df = df[pd.notnull(df["record_num"])] + df["record_num"] = df["record_num"].astype("Int32").astype("string") + + if not remove_duplicates: + return df.drop_duplicates() + elif use_record_num: # If one patron id maps to two rows that are identical except for the # barcode, arbitrarily delete one of the rows + df = df.drop_duplicates( + ["patron_id", "record_num", "ptype_code", "pcode3", + "patron_home_library_code"]) + return df.drop_duplicates("record_num", keep=False) + else: + # If one patron id maps to two rows that are identical except for the + # barcode or record_num, arbitrarily delete one of the rows df = df.drop_duplicates( ["patron_id", "ptype_code", "pcode3", "patron_home_library_code"]) return df.drop_duplicates("patron_id", keep=False) - else: - return df.drop_duplicates() def get_sierra_patron_data_from_barcodes(sierra_client, barcodes, @@ -159,12 +174,12 @@ def get_sierra_patron_data_from_barcodes(sierra_client, barcodes, barcode_patron_id_df = barcodes_to_patron_ids( sierra_client, barcodes, False, True) patron_data_df = get_sierra_patron_data_from_ids( - sierra_client, barcode_patron_id_df["patron_id"], False, False) + sierra_client, barcode_patron_id_df["patron_id"], False, False, False) if isolate_connection: sierra_client.close_connection() # If one patron id maps to two rows that are identical except for the - # barcode, arbitrarily delete one of the rows + # barcode or record_num, arbitrarily delete one of the rows patron_data_df = patron_data_df.drop_duplicates( ["patron_id", "ptype_code", "pcode3", "patron_home_library_code"]) @@ -181,7 +196,8 @@ def get_sierra_patron_data_from_barcodes(sierra_client, barcodes, how="left", on="patron_id") df = pd.concat([perfect_match_df, imperfect_match_df], ignore_index=True) df.loc[df.duplicated("barcode", keep=False), [ - "ptype_code", "pcode3", "patron_home_library_code"]] = None + "record_num", "ptype_code", "pcode3", + "patron_home_library_code"]] = None return df.drop_duplicates("barcode") diff --git a/tests/test_patron_data_helper.py b/tests/test_patron_data_helper.py index 7ba1ed4..b29d026 100644 --- a/tests/test_patron_data_helper.py +++ b/tests/test_patron_data_helper.py @@ -19,36 +19,39 @@ ] _TEST_SIERRA_IDS_RESPONSE = [ - (1, "b1", 11, 12, "aa"), (2, "b2", 21, 22, "bb"), - (3, "b3", 31, 32, "cc"), (33, "b3", 331, 332, "ccc"), - (4, None, None, None, None), (5, "b5", 51, 52, "dd"), - (6, "b6", 61, 62, "ee"), (6, "b66", 61, 62, "ee"), - (7, "b7", 71, 72, "ff"), (7, "b77", 771, 772, "ffff"), - (None, "b4", None, None, None), (5, "b5", 51, 52, "dd"), + (1, "1", "b1", 11, 12, "aa"), (2, "2", "b2", 21, 22, "bb"), + (3, "3", "b3", 31, 32, "cc"), (33, "3", "b3", 331, 332, "ccc"), + (4, None, None, None, None, None), (5, "5", "b5", 51, 52, "dd"), + (6, "6", "b6", 61, 62, "ee"), (6, "6", "b66", 61, 62, "ee"), + (7, "7", "b7", 71, 72, "ff"), (7, "77", "b77", 771, 772, "ffff"), + (None, "4", "b4", None, None, None), (5, "5", "b5", 51, 52, "dd"), ] _TEST_BARCODE_DF = pd.DataFrame( - [[f"b{i}", str(i)] for i in range(1, 10)], + [[f"b{i}", str(i)] for i in range(1, 11)], columns=["barcode", "patron_id"]) _TEST_BARCODE_DF["patron_id"] = _TEST_BARCODE_DF["patron_id"].astype("string") _TEST_ID_DF = pd.DataFrame( - [["1", "b1", 11, 12, "aa"], # one perfect match - ["2", "b5", 21, 22, "bb"], # different id and barcode matches + [["1", "1", "b1", 11, 12, "aa"], # one perfect match + ["2", "2", "b5", 21, 22, "bb"], # different id and barcode matches # no match for patron id 3 - ["4", "b4", 41, 42, "dd"], # two matches -- one perfect, one imperfect - ["4", "b444", 43, 44, "dddd"], - ["5", "b555", 51, 52, "eeee"], # two matches -- both imperfect - ["5", "b556", 53, 54, "eeef"], - ["6", "b6", 61, 62, "ffff"], # two matches -- both perfect - ["6", "b6", 63, 64, "fffg"], - ["7", "b777", 71, 72, "gg"], # two matches -- same except barcode - ["7", "b778", 71, 72, "gg"], - ["8", None, 81, 82, "hh"], # one imperfect match/no barcode - ["9", "b9", None, None, None]], # one perfect match/all null fields - columns=["patron_id", "barcode", "ptype_code", "pcode3", + ["4", "4", "b4", 41, 42, "dd"], # two matches -- perfect and imperfect + ["4", "4", "b444", 43, 44, "dddd"], + ["5", "5", "b555", 51, 52, "eeee"], # two matches -- both imperfect + ["5", "5", "b556", 53, 54, "eeef"], + ["6", "6", "b6", 61, 62, "ffff"], # two matches -- both perfect + ["6", "6", "b6", 63, 64, "fffg"], + ["7", "7", "b777", 71, 72, "gg"], # two matches -- same but barcode + ["7", "7", "b778", 71, 72, "gg"], + ["8", "88", "b8", 81, 82, "hh"], # two matches -- same but record_num + ["8", "89", "b8", 81, 82, "hh"], + ["9", "9", None, 91, 92, "ii"], # one match/no barcode + ["10", "10", "b10", None, None, None]], # one match/all null fields + columns=["patron_id", "record_num", "barcode", "ptype_code", "pcode3", "patron_home_library_code"]) -_TEST_ID_DF["patron_id"] = _TEST_ID_DF["patron_id"].astype("string") +_TEST_ID_DF[["patron_id", "record_num"]] = _TEST_ID_DF[ + ["patron_id", "record_num"]].astype("string") class TestPatronDataHelper: @@ -108,16 +111,20 @@ def test_barcodes_to_patron_ids_with_duplicates(self, mocker): remove_duplicates=False )) - def test_get_sierra_patron_data_from_ids(self, mocker): + def test_get_sierra_patron_data_from_ids_pat_ids(self, mocker): RESULT = pd.DataFrame( - [["1", "b1", 11, 12, "aa"], ["2", "b2", 21, 22, "bb"], - ["3", "b3", 31, 32, "cc"], ["33", "b3", 331, 332, "ccc"], - ["4", None, None, None, None], - ["5", "b5", 51, 52, "dd"], ["6", "b6", 61, 62, "ee"], - ["6", "b66", 61, 62, "ee"], ["7", "b7", 71, 72, "ff"], - ["7", "b77", 771, 772, "ffff"]], - columns=["patron_id", "barcode", "ptype_code", "pcode3", - "patron_home_library_code"]) + [["1", "1", "b1", 11, 12, "aa"], + ["2", "2", "b2", 21, 22, "bb"], + ["3", "3", "b3", 31, 32, "cc"], + ["33", "3", "b3", 331, 332, "ccc"], + ["4", None, None, None, None, None], + ["5", "5", "b5", 51, 52, "dd"], + ["6", "6", "b6", 61, 62, "ee"], + ["6", "6", "b66", 61, 62, "ee"], + ["7", "7", "b7", 71, 72, "ff"], + ["7", "77", "b77", 771, 772, "ffff"]], + columns=["patron_id", "record_num", "barcode", "ptype_code", + "pcode3", "patron_home_library_code"]) RESULT["patron_id"] = RESULT["patron_id"].astype("string") mock_sierra_client = mocker.MagicMock() @@ -137,7 +144,48 @@ def test_get_sierra_patron_data_from_ids(self, mocker): # directly. The workaround is to test the total length of the query # plus that each id appears in it. query = mock_sierra_client.execute_query.call_args[0][0] - assert len(query) == 257 + assert len(query) == 269 + assert "WHERE id IN" in query + for el in range(1, 9): + assert str(el) in query + + def test_get_sierra_patron_data_from_ids_record_nums(self, mocker): + RESULT = pd.DataFrame( + [["1", "1", "b1", 11., 12., "aa"], + ["2", "2", "b2", 21., 22., "bb"], + ["3", "3", "b3", 31., 32., "cc"], + ["33", "3", "b3", 331., 332., "ccc"], + ["5", "5", "b5", 51., 52., "dd"], + ["6", "6", "b6", 61., 62., "ee"], + ["6", "6", "b66", 61., 62., "ee"], + ["7", "7", "b7", 71., 72., "ff"], + ["7", "77", "b77", 771., 772., "ffff"]], + columns=["patron_id", "record_num", "barcode", "ptype_code", + "pcode3", "patron_home_library_code"], + index=[0, 1, 2, 3, 5, 6, 7, 8, 9]) + RESULT[["patron_id", "record_num"]] = RESULT[ + ["patron_id", "record_num"]].astype("string") + + mock_sierra_client = mocker.MagicMock() + mock_sierra_client.execute_query.return_value = \ + _TEST_SIERRA_IDS_RESPONSE + + assert_frame_equal( + RESULT, get_sierra_patron_data_from_ids( + mock_sierra_client, [str(el) for el in range(1, 9)] + ["1",], + use_record_num=True, + )) + + mock_sierra_client.connect.assert_called_once() + mock_sierra_client.execute_query.assert_called_once() + mock_sierra_client.close_connection.assert_called_once() + + # Because the set of record_nums is unordered, it can't be tested + # directly. The workaround is to test the total length of the query + # plus that each id appears in it. + query = mock_sierra_client.execute_query.call_args[0][0] + assert len(query) == 277 + assert "WHERE record_num IN" in query for el in range(1, 9): assert str(el) in query @@ -152,14 +200,18 @@ def test_get_sierra_patron_data_from_ids_unisolated(self, mocker): mock_sierra_client.execute_query.assert_called_once() mock_sierra_client.close_connection.assert_not_called() - def test_get_sierra_patron_data_from_ids_without_duplicates(self, mocker): + def test_get_sierra_patron_data_from_ids_without_duplicates_pat_ids( + self, mocker): RESULT = pd.DataFrame( - [["1", "b1", 11, 12, "aa"], ["2", "b2", 21, 22, "bb"], - ["3", "b3", 31, 32, "cc"], ["33", "b3", 331, 332, "ccc"], - ["4", None, None, None, None], ["5", "b5", 51, 52, "dd"], - ["6", "b6", 61, 62, "ee"]], - columns=["patron_id", "barcode", "ptype_code", "pcode3", - "patron_home_library_code"]) + [["1", "1", "b1", 11, 12, "aa"], + ["2", "2", "b2", 21, 22, "bb"], + ["3", "3", "b3", 31, 32, "cc"], + ["33", "3", "b3", 331, 332, "ccc"], + ["4", None, None, None, None, None], + ["5", "5", "b5", 51, 52, "dd"], + ["6", "6", "b6", 61, 62, "ee"]], + columns=["patron_id", "record_num", "barcode", "ptype_code", + "pcode3", "patron_home_library_code"]) RESULT["patron_id"] = RESULT["patron_id"].astype("string") mock_sierra_client = mocker.MagicMock() @@ -172,22 +224,49 @@ def test_get_sierra_patron_data_from_ids_without_duplicates(self, mocker): remove_duplicates=True )) + def test_get_sierra_patron_data_from_ids_without_duplicates_record_nums( + self, mocker): + RESULT = pd.DataFrame( + [["1", "1", "b1", 11., 12., "aa"], + ["2", "2", "b2", 21., 22., "bb"], + ["5", "5", "b5", 51., 52., "dd"], + ["6", "6", "b6", 61., 62., "ee"], + ["7", "7", "b7", 71., 72., "ff"], + ["7", "77", "b77", 771., 772., "ffff"]], + columns=["patron_id", "record_num", "barcode", "ptype_code", + "pcode3", "patron_home_library_code"], + index=[0, 1, 5, 6, 8, 9]) + RESULT[["patron_id", "record_num"]] = RESULT[ + ["patron_id", "record_num"]].astype("string") + + mock_sierra_client = mocker.MagicMock() + mock_sierra_client.execute_query.return_value = \ + _TEST_SIERRA_IDS_RESPONSE + + assert_frame_equal( + RESULT, get_sierra_patron_data_from_ids( + mock_sierra_client, [str(el) for el in range(1, 9)], + remove_duplicates=True, use_record_num=True, + )) + def test_get_sierra_patron_data_from_barcodes(self, mocker): RESULT = pd.DataFrame( - [["b1", "1", 11, 12, "aa"], - ["b4", "4", 41, 42, "dd"], - ["b6", "6", None, None, None], - ["b9", "9", None, None, None], - ["b2", "2", 21, 22, "bb"], - ["b3", "3", None, None, None], - ["b5", "5", None, None, None], - ["b7", "7", 71, 72, "gg"], - ["b8", "8", 81, 82, "hh"]], - columns=["barcode", "patron_id", "ptype_code", "pcode3", - "patron_home_library_code"]) - RESULT["patron_id"] = RESULT["patron_id"].astype("string") - TEST_BARCODES = [f"b{i}" for i in range(1, 11)] + ["b1",] - TEST_IDS = pd.Series([str(i) for i in range(1, 10)], + [["b1", "1", "1", 11, 12, "aa"], + ["b4", "4", "4", 41, 42, "dd"], + ["b6", "6", None, None, None, None], + ["b8", "8", "88", 81, 82, "hh"], + ["b10", "10", "10", None, None, None], + ["b2", "2", "2", 21, 22, "bb"], + ["b3", "3", None, None, None, None], + ["b5", "5", None, None, None, None], + ["b7", "7", "7", 71, 72, "gg"], + ["b9", "9", "9", 91, 92, "ii"]], + columns=["barcode", "patron_id", "record_num", "ptype_code", + "pcode3", "patron_home_library_code"]) + RESULT[["patron_id", "record_num"]] = RESULT[ + ["patron_id", "record_num"]].astype("string") + TEST_BARCODES = [f"b{i}" for i in range(1, 12)] + ["b1",] + TEST_IDS = pd.Series([str(i) for i in range(1, 11)], dtype="string", name="patron_id") mocked_barcodes_method = mocker.patch( "nypl_py_utils.functions.patron_data_helper.barcodes_to_patron_ids", # noqa: E501