Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
klinga committed Jun 14, 2024
2 parents 646e78d + 46bbb09 commit 458a665
Show file tree
Hide file tree
Showing 11 changed files with 295 additions and 35 deletions.
11 changes: 11 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"cSpell.words": [
"hathi",
"Zephir"
],
"cSpell.ignoreWords": [
"bibno",
"oclcno",
"rlin"
]
}
2 changes: 2 additions & 0 deletions files/picklist/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!.gitignore
15 changes: 15 additions & 0 deletions google_books/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
prep_onsite_manifest_for_google,
prep_recap_manifest_for_sierra_list,
)
from google_books.picklist import prep_item_list_for_sierra


__version__ = "0.1.0"
Expand Down Expand Up @@ -77,6 +78,20 @@ def onsite_manifest(filename: str) -> None:
click.echo(f"Prepped manifest was saved to {out.resolve()}")


# CLI entry point: registers `get_candidate_items` as a sub-command of `cli`.
# NOTE: click uses the function docstring as the command's help text, so the
# docstring below is user-facing output and is kept verbatim.
@cli.command()
# click.Path(exists=True) makes click reject a non-existent path before the
# function body runs.
@click.argument("tar_file", type=click.Path(exists=True))
def get_candidate_items(tar_file: str) -> None:
    """
    Prepares the item list for Sierra based on Google Candidate list _combined tar file.
    Creates `nypl-YYYY-MM-DD-candidate-items.csv` file with item numbers in the `picklist`
    folder.
    Args:
    tar_file (str): The tar file containing the candidate list.
    """
    # All the work (unpacking the archive and writing the CSV) is delegated to
    # google_books.picklist.prep_item_list_for_sierra.
    prep_item_list_for_sierra(tar_file)
    # The message names the output directory only; the exact file name is
    # built inside prep_item_list_for_sierra.
    click.echo("Candidate items have been saved to files/picklist/ directory.")


@cli.command()
@click.argument("marcxml_submitted", type=click.Path(exists=True))
@click.argument("marcxml_errors", type=click.Path(exists=True))
Expand Down
40 changes: 34 additions & 6 deletions google_books/hathi_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def find_bibno(line: str) -> str:
"""Extracts Sierra bib # from the report"""
bibno_idx = line.find(".b")
if bibno_idx >= 0:
bibno = line[bibno_idx : bibno_idx + 11]
bibno = line[bibno_idx + 1 : bibno_idx + 11]
else:
raise ValueError(f"Invalid Sierra bib # encountered. Line: {line}")
return bibno
Expand All @@ -26,7 +26,7 @@ def find_err_msg(line: str) -> str:
"""Extracts error message given in Zephir report"""
if line.startswith("ERROR"):
err_idx = line.find("): ")
return line[err_idx + 3 :].strip()
return line[err_idx + 3 :].strip() # noqa: E203
else:
return ""

Expand All @@ -40,29 +40,57 @@ def parse_hathi_processing_report(fh: str) -> None:
fh: path to processing report
"""
date = fh_date(fh)
succ_count = 0
succ_fh = f"files/out/hathi-{date}-success.csv"
inval_oclc_loc_count = 0
inval_oclc_fh = f"files/out/hathi-{date}-unspecified-oclc.csv"
miss_oclc_count = 0
miss_oclc_fh = f"files/out/hathi-{date}-missing-oclc.csv"
err_count = 0
err_fh = f"files/out/hathi-{date}-errors.csv"
with open(fh, "r") as file:
for line in file.readlines():
if "new cid =" in line:
cid = find_cid(line)
save2csv(f"files/out/hathi-{date}-success.csv", ",", [cid])
save2csv(succ_fh, ",", [cid])
succ_count += 1
elif line.startswith("WARNING: .b"):
bibno = find_bibno(line)
if "OCLC number found in unspecified 035$" in line:
save2csv(
f"files/out/hathi-{date}-unspecified-oclc.csv",
inval_oclc_fh,
",",
[bibno],
)
inval_oclc_loc_count += 1
elif "no OCLC number in record" in line:
save2csv(
f"files/out/hathi-{date}-missing-oclc.csv",
miss_oclc_fh,
",",
[bibno],
)
miss_oclc_count += 1
elif line.startswith("ERROR"):
bibno = find_bibno(line)
err_msg = find_err_msg(line)
save2csv(f"files/out/hathi-{date}-errors.csv", "'", [bibno, err_msg])
save2csv(
err_fh,
",",
[
bibno,
f"{date}",
"Zephir validation",
f"nyp_{date}_google.xml",
err_msg,
"NO",
],
)
err_count += 1
print(f"Report:\n\t")
print(f"success: {succ_count} ({succ_fh})\n")
print(f"\tOCLC in 035 only: {inval_oclc_loc_count} ({inval_oclc_fh})\n")
print(f"\tmissing OCLC#: {miss_oclc_count} ({miss_oclc_fh})\n")
print(f"\trejected: {err_count} ({err_fh})")


def google_reconciliation_to_barcodes_lst(fh: str) -> list[str]:
Expand Down
13 changes: 10 additions & 3 deletions google_books/marc_manipulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,8 @@ def create_stub_hathi_records(
out: path to MARC21 file with output stub records
"""
invalid_bibs = get_invalid_bib_nos(marcxml_errors)
print(f"Removing {len(invalid_bibs)} invalid records from the submission file.")
total_out_bibs = Counter()
print(f"Found {len(invalid_bibs)} rejected record(s) by Zephir.")
total_out_bibs = Counter() # type: ignore

for bib in marcxml_reader(marcxml_submitted):

Expand Down Expand Up @@ -187,7 +187,14 @@ def get_invalid_bib_nos(marcxml_error: str) -> list[str]:
Args:
marcxml_error: path to marxml with invalid bibs provided by Hathi
"""
return [bib.get("907").get("a") for bib in marcxml_reader(marcxml_error)]
try:
bibNos = []
for bib in marcxml_reader(marcxml_error):
bibNos.append(bib.get("907").get("a")) # type: ignore
return bibNos
except AttributeError:
warnings.warn("Encountered bib without Sierra bib # in 907$a).")
return []


def marcxml_reader(fh: str) -> Iterator[Record]:
Expand Down
48 changes: 48 additions & 0 deletions google_books/picklist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""
A module with methods to unpack Google Candidate List and create
NYPL pick list.
"""

import csv
import glob
import tarfile
import warnings

from google_books.utils import fh_date, save2csv


def extract_candidate_list(tar_file: str, dest: str = "files/picklist") -> None:
    """
    Extracts the candidate list from the tar file.

    Args:
        tar_file (str): The tar file containing the candidate list.
        dest (str): Directory the archive members are extracted into.
            Defaults to `files/picklist`, the location that used to be
            hard-coded, so existing callers are unaffected.

    Raises:
        tarfile.TarError: if the archive cannot be read, or (when extraction
            filters are available) if a member is rejected as unsafe.
    """
    with tarfile.open(tar_file, "r") as tar:
        if hasattr(tarfile, "data_filter"):
            # The archive is produced outside this project, so do not trust
            # its member names. The "data" filter refuses members that would
            # be written outside `dest` (absolute paths, `..`, links pointing
            # elsewhere) as well as device files, and it also avoids the
            # DeprecationWarning Python 3.12+ emits for unfiltered extraction.
            tar.extractall(dest, filter="data")
        else:
            # Interpreters without extraction filters: keep the legacy call.
            tar.extractall(dest)


def prep_item_list_for_sierra(tar_file: str) -> None:
    """
    Prepares the item list for Sierra based on Google Candidate list _combined tar file.
    Creates `nypl-YYYY-MM-DD-candidate-items.csv` file with item numbers in the `picklist`
    folder.

    Blank lines in the extracted lists are skipped. Rows that have no second
    column are skipped with a warning instead of aborting the whole run.

    Args:
        tar_file (str): The tar file containing the candidate list.
    """
    date = fh_date(tar_file)
    out = f"files/picklist/nypl-{date}-candidate-items.csv"

    extract_candidate_list(tar_file)

    # read each extracted .txt file, find item #, and write it to a new file
    # NOTE(review): the glob also matches lists left over from earlier
    # extractions into files/picklist - confirm that is intended.
    for path in glob.glob("files/picklist/*_combined-*.txt"):
        # `with` guarantees the list is closed once read; newline="" is what
        # the csv module expects from files it parses.
        with open(path, "r", encoding="utf-8", newline="") as candidates:
            for row in csv.reader(candidates, delimiter="\t"):
                if not row:
                    # csv.reader yields [] for a blank line
                    continue
                if len(row) < 2:
                    warnings.warn(
                        f"Skipping row without an item column in {path}: {row!r}"
                    )
                    continue
                # the item value is the second tab-separated column minus its
                # first character (presumably a prefix - verify against the
                # candidate list format)
                item = row[1][1:]
                save2csv(out, ",", [item])


if __name__ == "__main__":
    # Manual run of the module as a script against one hard-coded sample
    # archive; the path must exist relative to the current working directory.
    prep_item_list_for_sierra("files/picklist/nypl-2024-05-15_combined.tar.gz")
24 changes: 20 additions & 4 deletions google_books/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import csv
from datetime import datetime

import click

Expand Down Expand Up @@ -29,13 +30,28 @@ def fh_date(fh: str) -> str:
"""Creates base name for analysis report files"""
err_msg = (
"The name of the file to be parsed is invalid. "
"Correct pattern: 'nyp_YYYYMMDD_google'."
"Correct pattern: 'nyp_YYYYMMDD_google' or 'files/picklist/nypl-YYYY-MM-DD_'"
)
try:
fh_str = click.format_filename(fh)
fh_date = fh_str.split("_")[1]
if fh_str.startswith("files/picklist/nypl-"):
fh_date = fh_str.split("_")[0][20:]
else:
fh_date = fh_str.split("_")[1]
except IndexError:
raise ValueError(err_msg)
if not fh_date.isdigit():

# check if the date is in the correct format
try:
datetime.strptime(fh_date, "%Y%m%d")
except ValueError:
pass
else:
return fh_date

try:
datetime.strptime(fh_date, "%Y-%m-%d")
except ValueError:
raise ValueError(err_msg)
return fh_date
else:
return fh_date
4 changes: 2 additions & 2 deletions tests/marcxml-sample-no-barcode.xml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
<marc:subfield code="h">*MGSB (Colby, G. Conflict. 1930)</marc:subfield>
</marc:datafield>
<marc:datafield tag="907" ind1=" " ind2=" ">
<marc:subfield code="a">.b122776471</marc:subfield>
<marc:subfield code="a">.b122776470</marc:subfield>
<marc:subfield code="b">07-18-23</marc:subfield>
<marc:subfield code="c">12-15-2008 15:35</marc:subfield>
</marc:datafield>
Expand Down Expand Up @@ -91,7 +91,7 @@
<marc:subfield code="y">2386549</marc:subfield>
</marc:datafield>
<marc:datafield tag="945" ind1=" " ind2=" ">
<marc:subfield code="a">.b122776471</marc:subfield>
<marc:subfield code="a">.b122776470</marc:subfield>
</marc:datafield>
</marc:record>
</marc:collection>
123 changes: 123 additions & 0 deletions tests/marcxml-sample-one-bib.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
<collection xmlns="http://www.loc.gov/MARC21/slim">
<record>
<leader>01291nam a2200325zi 4500</leader>
<controlfield tag="001">NYPY710535216-B</controlfield>
<controlfield tag="003">CStRLIN</controlfield>
<controlfield tag="005">19990811185213.2</controlfield>
<controlfield tag="008">711126c19301921xxua| |b |0|| | eng d</controlfield>
<datafield tag="035" ind1=" " ind2=" ">
<subfield code="a">(NN-PD)710535216</subfield>
</datafield>
<datafield tag="035" ind1=" " ind2=" ">
<subfield code="a">(WaOLN)nyp2264446</subfield>
</datafield>
<datafield tag="035" ind1=" " ind2=" ">
<subfield code="a">(OCoLC)2386549</subfield>
</datafield>
<datafield tag="040" ind1=" " ind2=" ">
<subfield code="a">NN-PD</subfield>
<subfield code="c">NN-PD</subfield>
<subfield code="d">CStRLIN</subfield>
<subfield code="d">WaOLN</subfield>
</datafield>
<datafield tag="100" ind1="1" ind2=" ">
<subfield code="a">Colby, Gertrude K.</subfield>
</datafield>
<datafield tag="245" ind1="1" ind2="4">
<subfield code="a">The conflict;</subfield>
<subfield code="b">a health masque in pantomime, by Gertrude K. Colby with an introduction by Thomas D. Wood.</subfield>
</datafield>
<datafield tag="260" ind1=" " ind2=" ">
<subfield code="a">New York,</subfield>
<subfield code="b">A. S. Barnes and Co.,</subfield>
<subfield code="c">1930 [c1921]</subfield>
</datafield>
<datafield tag="300" ind1=" " ind2=" ">
<subfield code="a">70 p.</subfield>
<subfield code="b">illus., diagrs.</subfield>
<subfield code="c">24 cm.</subfield>
</datafield>
<datafield tag="504" ind1=" " ind2=" ">
<subfield code="a">Bibliography: p. 70.</subfield>
</datafield>
<datafield tag="690" ind1=" " ind2="4">
<subfield code="a">Masques (Works).</subfield>
<subfield code="b">The conflict.</subfield>
</datafield>
<datafield tag="690" ind1=" " ind2="4">
<subfield code="a">Children's pageants, masques, plays.</subfield>
</datafield>
<datafield tag="690" ind1=" " ind2="4">
<subfield code="a">Children's pantomimes.</subfield>
</datafield>
<datafield tag="799" ind1="0" ind2=" ">
<subfield code="a">Gift of Alice A. Sefton.</subfield>
</datafield>
<datafield tag="852" ind1="8" ind2=" ">
<subfield code="h">*MGSB (Colby, G. Conflict. 1930)</subfield>
</datafield>
<datafield tag="907" ind1=" " ind2=" ">
<subfield code="a">.b122776471</subfield>
<subfield code="b">07-18-23</subfield>
<subfield code="c">12-15-2008 15:35</subfield>
</datafield>
<datafield tag="998" ind1=" " ind2=" ">
<subfield code="a">pad</subfield>
<subfield code="b">09-05-99</subfield>
<subfield code="c">m</subfield>
<subfield code="d">a </subfield>
<subfield code="e">-</subfield>
<subfield code="f">eng</subfield>
<subfield code="g">xxu</subfield>
<subfield code="h">4</subfield>
<subfield code="i">1</subfield>
</datafield>
<datafield tag="959" ind1=" " ind2=" ">
<subfield code="a">.b37554542</subfield>
<subfield code="b">08-28-07</subfield>
<subfield code="c">09-23-95</subfield>
</datafield>
<datafield tag="910" ind1=" " ind2=" ">
<subfield code="a">RL</subfield>
</datafield>
<datafield tag="997" ind1=" " ind2=" ">
<subfield code="a">pd</subfield>
<subfield code="b">09-05-99</subfield>
<subfield code="c">m</subfield>
<subfield code="d">a</subfield>
<subfield code="e">-</subfield>
<subfield code="f">eng</subfield>
<subfield code="g">xxu</subfield>
<subfield code="h">4</subfield>
</datafield>
<datafield tag="991" ind1=" " ind2=" ">
<subfield code="y">2386549</subfield>
</datafield>
<datafield tag="945" ind1=" " ind2=" ">
<subfield code="a">*MGSB (Colby, G. Conflict. 1930)</subfield>
<subfield code="d"> - - </subfield>
<subfield code="e">0</subfield>
<subfield code="f">12-06-2023 8:43</subfield>
<subfield code="g">1</subfield>
<subfield code="h">03-05-24</subfield>
<subfield code="i">33433010141525</subfield>
<subfield code="j">221</subfield>
<subfield code="k"> - - </subfield>
<subfield code="l">rcpd2</subfield>
<subfield code="o">-</subfield>
<subfield code="p">$0.00</subfield>
<subfield code="q">-</subfield>
<subfield code="r">2</subfield>
<subfield code="s">- </subfield>
<subfield code="t">2</subfield>
<subfield code="u">2</subfield>
<subfield code="v">0</subfield>
<subfield code="w">1</subfield>
<subfield code="x">0</subfield>
<subfield code="y">.i105014709</subfield>
<subfield code="z">02-03-2009 5:58</subfield>
<subfield code="2">0</subfield>
<subfield code="1">814</subfield>
</datafield>
</record>
</collection>
Loading

0 comments on commit 458a665

Please sign in to comment.