From 75a4e4bf0072569de1fc1d2ace9c6725ae7243f0 Mon Sep 17 00:00:00 2001 From: klinga Date: Sun, 17 Mar 2024 23:00:06 -0400 Subject: [PATCH] renamed modules --- google_books/hathi_processor.py | 98 +++++++++++++++++++ google_books/hathi_report.py | 43 -------- .../{manifest.py => recap_manifest.py} | 0 tests/test_hathi_processor.py | 9 ++ tests/test_hathi_report.py | 24 ----- 5 files changed, 107 insertions(+), 67 deletions(-) create mode 100644 google_books/hathi_processor.py delete mode 100644 google_books/hathi_report.py rename google_books/{manifest.py => recap_manifest.py} (100%) create mode 100644 tests/test_hathi_processor.py delete mode 100644 tests/test_hathi_report.py diff --git a/google_books/hathi_processor.py b/google_books/hathi_processor.py new file mode 100644 index 0000000..57cf105 --- /dev/null +++ b/google_books/hathi_processor.py @@ -0,0 +1,98 @@ +import csv + +from google_books.marc_manipulator import marcxml_reader, save2marcxml +from google_books.utils import save2csv, fh_date + + +def find_bibno(line: str) -> str: + """Extracts Sierra bib # from the report""" + bibno = line[9:20] + if not bibno.startswith(".b"): + print(line) + raise ValueError("Invalid Sierra bib # encountered.") + return bibno + + +def find_cid(line: str) -> str: + """Extracts Hathi cid from the report""" + cid = line[-10:].strip() + if not cid.isdigit(): + raise ValueError("Invalid Hathi CID encountered.") + return cid + + +def parse_hathi_processing_report(fh: str) -> None: + """ + Parses Hathi job report and filters the results into three separate + files: hathi-success.csv, hathi-unspecified-oclc.csv, hathi-missing-oclc.csv. + + Args: + fh: path to processing report + """ + date = fh_date(fh) + with open(fh, "r") as file: + for line in file.readlines(): + if "new cid =" in line: + cid = find_cid(line) + save2csv(f"files/out/hathi-{date}-success.csv", [cid]) + elif line.startswith("WARNING: .b"): + bibno = find_bibno(line) + if "OCLC number found in unspecifed 035$" in line: + save2csv( + f"files/out/hathi-{date}-unspecified-oclc.csv", + [bibno], + ) + elif "no OCLC number in record" in line: + save2csv( + f"files/out/hathi-{date}-missing-oclc.csv", + [bibno], + ) + + +def google_reconciliation_to_barcodes_lst(fh: str) -> list[str]: + """ + Parses Google's *FOreconciled.txt report and returns a list + + Args: + fh: path to google reconciliation report + + Returns: + A list of rejected barcodes + """ + barcodes = [] + with open(fh, "r") as report: + reader = csv.reader(report) + next(reader) + for row in reader: + barcodes.append(row[0]) + return barcodes + + +def clean_metadata_for_hathi_submission( + metadata_fh: str, reconcile_report_fh: str, out: str +) -> None: + """ + Using Google's reconcilation FO report removes from a given metadata file records + that include rejected for digitizaiton items (barcodes). + + Args: + metadata_fh: path to marcxml file with records + reconcile_report_fh: path to google reconciliation FO report + """ + rejected_barcodes = google_reconciliation_to_barcodes_lst(reconcile_report_fh) + bibs = marcxml_reader(metadata_fh) + + bibs2keep = [] + for bib in bibs: + barcode = bib.get("945").get("i").strip() + if barcode not in rejected_barcodes: + bibs2keep.append(bib) + + save2marcxml(out, bibs2keep) + + +if __name__ == "__main__": + import sys + + print(sys.argv[1]) + clean_metadata_for_hathi_submission(sys.argv[1], sys.argv[2], sys.argv[3]) diff --git a/google_books/hathi_report.py b/google_books/hathi_report.py deleted file mode 100644 index 3ff6347..0000000 --- a/google_books/hathi_report.py +++ /dev/null @@ -1,43 +0,0 @@ -from google_books.utils import save2csv, fh_date - - -def find_bibno(line: str) -> str: - """Extracts Sierra bib # from the report""" - bibno = line[9:20] - if not bibno.startswith(".b"): - print(line) - raise ValueError("Invalid Sierra bib # encountered.") - return bibno - - -def find_cid(line: str) -> str: - """Extracts Hathi cid from the report""" - cid = line[-10:].strip() - if not cid.isdigit(): - raise ValueError("Invalid Hathi CID encountered.") - return cid - - -def parse_report(fh: str) -> None: - """ - Parses Hathi job report and filters the results into three separate - files: hathi-success.csv, hathi-unspecified-oclc.csv, hathi-missing-oclc.csv. - """ - date = fh_date(fh) - with open(fh, "r") as file: - for line in file.readlines(): - if "new cid =" in line: - cid = find_cid(line) - save2csv(f"files/out/hathi-{date}-success.csv", [cid]) - elif line.startswith("WARNING: .b"): - bibno = find_bibno(line) - if "OCLC number found in unspecifed 035$" in line: - save2csv( - f"files/out/hathi-{date}-unspecified-oclc.csv", - [bibno], - ) - elif "no OCLC number in record" in line: - save2csv( - f"files/out/hathi-{date}-missing-oclc.csv", - [bibno], - ) diff --git a/google_books/manifest.py b/google_books/recap_manifest.py similarity index 100% rename from google_books/manifest.py rename to google_books/recap_manifest.py diff --git a/tests/test_hathi_processor.py b/tests/test_hathi_processor.py new file mode 100644 index 0000000..1a75628 --- /dev/null +++ b/tests/test_hathi_processor.py @@ -0,0 +1,9 @@ +from google_books.hathi_processor import google_reconciliation_to_barcodes_lst + + +def test_google_reconciliation_to_barcodes_lst(): + sample_report = "tests/google-reconciliation-FO-report-sample.txt" + assert google_reconciliation_to_barcodes_lst(sample_report) == [ + "33433004338053", + "33433004727081", + ] diff --git a/tests/test_hathi_report.py b/tests/test_hathi_report.py deleted file mode 100644 index e77c0a5..0000000 --- a/tests/test_hathi_report.py +++ /dev/null @@ -1,24 +0,0 @@ -import pytest - - -from google_books.utils import fh_date - - -@pytest.mark.parametrize( - "arg,expectation", - [ - ("nyp_20231208_google.txt", "20231208"), - ("nyp_20240101_google_recap.txt", "20240101"), - ], -) -def test_report_name_base_valid(arg, expectation): - assert fh_date(arg) == expectation - - -@pytest.mark.parametrize( - "arg", - ["foo", "20231208_google.txt", "nyp_foo_google.txt", "nyp-20231208-google.txt"], -) -def test_report_name_base_invalid(arg): - with pytest.raises(ValueError): - fh_date(arg)