From 3f97f983acb2df16dd0b86018a703a0644eb4c4b Mon Sep 17 00:00:00 2001 From: Monique Rio Date: Mon, 29 Jul 2024 14:47:54 -0400 Subject: [PATCH] feat: incorporates remediated sh headings into db --- lib/authority_browse.rb | 2 + lib/authority_browse/remediated_subjects.rb | 133 +++++++++++++ .../solr_document/authority_graph.rb | 8 +- .../solr_document/subjects.rb | 2 +- lib/authority_browse/subjects.rb | 9 + lib/browse.rb | 7 +- solr/authority_browse/conf/managed-schema | 1 + spec/authority_browse/names_spec.rb | 3 - .../remediated_subjects_spec.rb | 178 ++++++++++++++++++ .../solr_document/subjects_spec.rb | 39 ++-- spec/authority_browse/subjects_spec.rb | 23 ++- spec/fixtures/remediated_subject.xml | 55 ++++++ spec/fixtures/remediated_subjects.xml | 2 + 13 files changed, 438 insertions(+), 24 deletions(-) create mode 100644 lib/authority_browse/remediated_subjects.rb create mode 100644 spec/authority_browse/remediated_subjects_spec.rb create mode 100644 spec/fixtures/remediated_subject.xml create mode 100644 spec/fixtures/remediated_subjects.xml diff --git a/lib/authority_browse.rb b/lib/authority_browse.rb index 3e7d899d..05897f92 100644 --- a/lib/authority_browse.rb +++ b/lib/authority_browse.rb @@ -7,6 +7,7 @@ require "services" require "concurrent" require "alma_rest_client" +require "marc" module AuthorityBrowse end @@ -20,3 +21,4 @@ module AuthorityBrowse require "authority_browse/base" require "authority_browse/names" require "authority_browse/subjects" +require "authority_browse/remediated_subjects" diff --git a/lib/authority_browse/remediated_subjects.rb b/lib/authority_browse/remediated_subjects.rb new file mode 100644 index 00000000..77f15df0 --- /dev/null +++ b/lib/authority_browse/remediated_subjects.rb @@ -0,0 +1,133 @@ +module AuthorityBrowse + class RemediatedSubjects + include Enumerable + + def initialize(file_path = S.remediated_subjects_file) + xml_lines = File.readlines(file_path) + @entries = xml_lines.map do |line| + Entry.new(line) + end + 
end + + def each(&block) + @entries.each(&block) + end + + class Entry + def initialize(xml) + @record = MARC::XMLReader.new(StringIO.new(xml)).first + end + + def id + @record["001"].value + end + + def preferred_term + @preferred_term ||= Term::Preferred.new(@record["150"]) + end + + def xrefs + @record.fields(["450", "550"]).map do |field| + [Term::SeeInstead, Term::Broader, Term::Narrower].find do |kind| + kind.match?(field) + end&.new(field) + end.compact + end + + def add_to_db + preferred_term.add_to_db(id) + xrefs.each do |xref| + xref.add_to_db(id) + end + end + end + + class Term + def initialize(field) + @field = field + end + + def kind + raise NotImplementedError + end + + def add_to_db(preferred_term_id) + if id == match_text + AuthorityBrowse.db[:subjects].insert(id: id, label: label, match_text: match_text, deprecated: false) + end + end + + def label + @field.subfields + .filter_map do |x| + x.value if ["a", "v", "x", "y", "z"].include?(x.code) + end + .join("--") + end + + def match_text + AuthorityBrowse::Normalize.match_text(label) + end + + def id + AuthorityBrowse.db[:subjects]&.first(match_text: match_text)&.dig(:id) || match_text + end + + class Preferred < Term + def add_to_db(id) + AuthorityBrowse.db[:subjects].insert(id: id, label: label, match_text: match_text, deprecated: false) + end + end + + class SeeInstead < Term + def self.match?(field) + field.tag == "450" + end + + def kind + "see_instead" + end + + def add_to_db(preferred_term_id) + super + xrefs = AuthorityBrowse.db[:subjects_xrefs] + xrefs.insert(subject_id: id, xref_id: preferred_term_id, xref_kind: kind) + end + end + + class Broader < Term + def self.match?(field) + field.tag == "550" && field["w"] == "g" + end + + def kind + "broader" + end + + def add_to_db(preferred_term_id) + super + xrefs = AuthorityBrowse.db[:subjects_xrefs] + xrefs.insert(subject_id: preferred_term_id, xref_id: id, xref_kind: kind) + xrefs.insert(subject_id: id, xref_id: preferred_term_id, 
xref_kind: "narrower") + end + end + + class Narrower < Term + def self.match?(field) + field.tag == "550" && field["w"] == "h" + end + + def kind + "narrower" + end + + def add_to_db(preferred_term_id) + super + xrefs = AuthorityBrowse.db[:subjects_xrefs] + xrefs.insert(subject_id: preferred_term_id, xref_id: id, xref_kind: kind) + xrefs.insert(subject_id: id, xref_id: preferred_term_id, xref_kind: "broader") + end + end + end + end +end diff --git a/lib/authority_browse/solr_document/authority_graph.rb b/lib/authority_browse/solr_document/authority_graph.rb index 8481b7b9..a83fdd44 100644 --- a/lib/authority_browse/solr_document/authority_graph.rb +++ b/lib/authority_browse/solr_document/authority_graph.rb @@ -33,9 +33,9 @@ def term # Library of Congress ID # - # @return [String] + # @return [String] if the id is a valid one def loc_id - first[:id] + first[:id] if first[:id]&.match?("loc.gov") end # @return [Integer] @@ -52,10 +52,10 @@ def xrefs @xrefs.map do |xref| [ xref, @data.filter_map do |x| - xref_count = x[:xref_count] + xref_count = x[:xref_count].to_i output = "#{x[:xref_label]}||#{xref_count}" if @kind.to_s == "name" - output unless xref_count.nil? 
|| xref_count == 0 + output unless xref_count == 0 elsif x[:xref_kind] == xref.to_s output end diff --git a/lib/authority_browse/solr_document/subjects.rb b/lib/authority_browse/solr_document/subjects.rb index 189d47b4..ec2f0bd7 100644 --- a/lib/authority_browse/solr_document/subjects.rb +++ b/lib/authority_browse/solr_document/subjects.rb @@ -8,7 +8,7 @@ def self.kind # @return [Array] List of kinds of xrefs for subjects def self.xrefs - [:broader, :narrower] + [:broader, :narrower, :see_instead] end class AuthorityGraphSolrDocument < Subjects diff --git a/lib/authority_browse/subjects.rb b/lib/authority_browse/subjects.rb index 04fd783a..a39162e9 100644 --- a/lib/authority_browse/subjects.rb +++ b/lib/authority_browse/subjects.rb @@ -76,6 +76,15 @@ def reset_db(loc_file_getter = lambda { fetch_skos_file }) end end + def incorporate_remediated_subjects(file_path = S.remediated_subjects_file) + subjects = AuthorityBrowse::RemediatedSubjects.new(file_path) + AuthorityBrowse.db.transaction do + subjects.each do |entry| + entry.add_to_db + end + end + end + # Loads solr with documents of subjects that match data from Library of # Congress. # @param solr_uploader Solr::Uploader] diff --git a/lib/browse.rb b/lib/browse.rb index 88e3347d..002b5d8d 100644 --- a/lib/browse.rb +++ b/lib/browse.rb @@ -103,11 +103,16 @@ class Subjects < Thor desc "reset_db", "resets subjects skos tables" long_desc <<~DESC Downloads the latest version of the skosrdf data for subjects from the - Library of Congress. Reloads the tables :subjects and :names_see_also with + Library of Congress. Reloads the tables :subjects and :subjects_see_also with the new data. Gets rid of duplicate deprecated subjects. + Incorporates remediated subject headings. 
DESC def reset_db + S.logger.info "Loading Subjects from Library of Congress" AuthorityBrowse::Subjects.reset_db + S.logger.info "Loading remediated subjects" + AuthorityBrowse::Subjects.incorporate_remediated_subjects + S.logger.info "Finished Loading Subjects" end desc "update", "updates subjects tables with counts from biblio" diff --git a/solr/authority_browse/conf/managed-schema b/solr/authority_browse/conf/managed-schema index addabe3d..a2a6b652 100644 --- a/solr/authority_browse/conf/managed-schema +++ b/solr/authority_browse/conf/managed-schema @@ -76,6 +76,7 @@ + diff --git a/spec/authority_browse/names_spec.rb b/spec/authority_browse/names_spec.rb index f383057d..8f28e46b 100644 --- a/spec/authority_browse/names_spec.rb +++ b/spec/authority_browse/names_spec.rb @@ -62,7 +62,6 @@ expect(file_contents).to eq([ { id: "first\u001fname", - loc_id: "id1", browse_field: "name", term: "First", count: 1, @@ -71,7 +70,6 @@ }.to_json + "\n", { id: "second\u001fname", - loc_id: "id2", browse_field: "name", term: "Second", count: 2, @@ -79,7 +77,6 @@ }.to_json + "\n", { id: "third\u001fname", - loc_id: "id3", browse_field: "name", term: "Third", count: 3, diff --git a/spec/authority_browse/remediated_subjects_spec.rb b/spec/authority_browse/remediated_subjects_spec.rb new file mode 100644 index 00000000..c6072347 --- /dev/null +++ b/spec/authority_browse/remediated_subjects_spec.rb @@ -0,0 +1,178 @@ +RSpec.describe AuthorityBrowse::RemediatedSubjects do + subject do + described_class.new(File.join(S.project_root, "spec", "fixtures", "remediated_subjects.xml")) + end + + it "is enumerable" do + expect(subject.is_a?(Enumerable)).to eq(true) + end + + it "contains Entry objects" do + expect(subject.first.class).to eq(AuthorityBrowse::RemediatedSubjects::Entry) + end +end +RSpec.describe AuthorityBrowse::RemediatedSubjects::Entry do + before(:each) do + @subject_record = fixture("remediated_subject.xml") + end + subject do + described_class.new(@subject_record) + end + it 
"returns the mms_id for the #id" do + expect(subject.id).to eq("98187481368406381") + end + context "#preferred_term" do + it "returns the #label from 150$avxyz" do + expect(subject.preferred_term.label).to eq("Children of undocumented immigrants--Education--Law and legislation") + end + + it "returns the #match_text of the label" do + expect(subject.preferred_term.match_text).to eq("children of undocumented immigrants--education--law and legislation") + end + end + it "has xrefs" do + xrefs = subject.xrefs + expect(xrefs.count).to eq 3 + expect(xrefs[0].kind).to eq("see_instead") + expect(xrefs[1].kind).to eq("see_instead") + expect(xrefs[2].kind).to eq("broader") + end +end + +RSpec.describe AuthorityBrowse::RemediatedSubjects::Term do + let(:record) do + MARC::XMLReader.new(StringIO.new(fixture("remediated_subject.xml"))).first + end + let(:preferred_term) do + described_class::Preferred.new(record["150"]) + end + let(:term) do + record.fields("450")[1] + end + let(:xrefs_table) do + AuthorityBrowse.db[:subjects_xrefs] + end + let(:subjects_table) do + AuthorityBrowse.db[:subjects] + end + + before(:each) do + @term = term + end + subject do + described_class.new(@term) + end + it "has a label" do + expect(subject.label).to eq("Children of illegal aliens--Education--Law and legislation") + end + it "has a match_text" do + expect(subject.match_text).to eq("children of illegal aliens--education--law and legislation") + end + context "id" do + it "returns the id from the database if it exists" do + subjects = AuthorityBrowse.db[:subjects] + subjects.insert(id: "official_id", match_text: "children of illegal aliens--education--law and legislation") + expect(subject.id).to eq("official_id") + end + it "returns the match text if it doesn't exist" do + expect(subject.id).to eq("children of illegal aliens--education--law and legislation") + end + end + context "kind" do + it "raises NotImplemented error for base class" do + expect { subject.kind }.to 
raise_error(NotImplementedError) + end + end + context "Preferred" do + it "adds remediated subject to db" do + expect(AuthorityBrowse.db[:subjects].where(id: "preferred_term_id").any?).to eq(false) + preferred_term.add_to_db("preferred_term_id") + expect(AuthorityBrowse.db[:subjects].where(id: "preferred_term_id").any?).to eq(true) + end + end + context "SeeInstead" do + it "has kind :see_instead" do + expect(described_class::SeeInstead.new(@term).kind).to eq("see_instead") + end + it "updates the xrefs db" do + expect(xrefs_table.where(xref_kind: "see_instead").any?).to eq(false) + see_instead_inst = described_class::SeeInstead.new(@term) + see_instead_inst.add_to_db("preferred_term_id") + see_instead = xrefs_table.where(xref_kind: "see_instead").first + expect(see_instead[:xref_id]).to eq("preferred_term_id") + expect(see_instead[:subject_id]).to eq("children of illegal aliens--education--law and legislation") + expect(subjects_table.where(id: see_instead_inst.match_text).any?).to eq(true) + end + context "match?(field)" do + it "is true for a 450" do + expect(described_class::SeeInstead.match?(@term)).to eq(true) + end + it "is false for not 450" do + @term.tag = "550" + expect(described_class::SeeInstead.match?(@term)).to eq(false) + end + end + end + context "Broader" do + let(:broader_term) do + @term.tag = "550" + @term.append(MARC::Subfield.new("w", "g")) + @term + end + let(:broader_inst) do + described_class::Broader.new(broader_term) + end + it "has kind :broader" do + expect(broader_inst.kind).to eq("broader") + end + it "adds xrefs to db" do + broader_inst.add_to_db("preferred_field_id") + expect(xrefs_table.where(subject_id: "preferred_field_id").first[:xref_kind]).to eq("broader") + expect(xrefs_table.where(xref_id: "preferred_field_id").first[:xref_kind]).to eq("narrower") + expect(subjects_table.where(id: broader_inst.match_text).any?).to eq(true) + end + context "match?(field)" do + it "is true for a 550 with $wg" do + 
expect(described_class::Broader.match?(broader_term)).to eq(true) + end + it "is false for not 550" do + expect(described_class::Broader.match?(@term)).to eq(false) + end + it "is false for 550 without $wg" do + @term.tag = "550" + expect(described_class::Broader.match?(@term)).to eq(false) + end + end + end + context "Narrower" do + let(:narrower_term) do + @term.tag = "550" + @term.append(MARC::Subfield.new("w", "h")) + @term + end + let(:narrower_inst) do + described_class::Narrower.new(narrower_term) + end + it "has kind :narrower" do + expect(narrower_inst.kind).to eq("narrower") + end + it "adds xrefs to db" do + narrower_inst.add_to_db("preferred_field_id") + expect(xrefs_table.where(subject_id: "preferred_field_id").first[:xref_kind]).to eq("narrower") + expect(xrefs_table.where(xref_id: "preferred_field_id").first[:xref_kind]).to eq("broader") + expect(subjects_table.where(id: narrower_inst.match_text).any?).to eq(true) + end + context "match?(field)" do + it "is true for a 550 with $wh" do + expect(described_class::Narrower.match?(narrower_term)).to eq(true) + end + it "is false for not 550" do + expect(described_class::Narrower.match?(@term)).to eq(false) + end + it "is false for 550 without $wh" do + @term.tag = "550" + expect(described_class::Narrower.match?(@term)).to eq(false) + end + end + end +end diff --git a/spec/authority_browse/solr_document/subjects_spec.rb b/spec/authority_browse/solr_document/subjects_spec.rb index 71b6030a..49352f67 100644 --- a/spec/authority_browse/solr_document/subjects_spec.rb +++ b/spec/authority_browse/solr_document/subjects_spec.rb @@ -72,9 +72,13 @@ end end context "#loc_id" do - it "returns the loc_id" do + it "returns the loc_id when the id is a loc id" do expect(subject.loc_id).to eq(counterpoint_id) end + it "returns nil when it's not the loc id" do + @subject[0][:id] = "9912351598" + expect(subject.loc_id).to be_nil + end end context "#term" do it "has the expected term" do @@ -95,39 +99,51 @@ it "has the
broader and narrower terms and their count separated by ||" do expect(subject.xrefs).to eq({ broader: ["Music theory||50"], - narrower: ["Canon (Musical form)||30", "Cantus firmus||7"] + narrower: ["Canon (Musical form)||30", "Cantus firmus||7"], + see_instead: [] + }) + end + it "handles see_instead values" do + @subject[0][:xref_kind] = "see_instead" + expect(subject.xrefs).to eq({ + broader: [], + narrower: ["Canon (Musical form)||30", "Cantus firmus||7"], + see_instead: ["Music theory||50"] }) end - it "is empty when there are nil broaders" do + it "is shows broaders when there are nil broaders" do @subject = [ { id: counterpoint_id, match_text: "counterpoint", label: "Counterpoint", count: 1000, - broader_label: nil, - broader_count: nil + xref_label: "Music theory", + xref_count: nil, + xref_kind: "boader" } ] expect(subject.xrefs).to eq({ broader: [], - narrower: [] + narrower: [], + see_instead: [] }) end - it "is empty when broaders have a 0 count" do + it "is shows when broaders have a 0 count" do @subject = [ { id: counterpoint_id, match_text: "counterpoint", label: "Counterpoint", count: 1000, - broader_label: "something", - broader_count: 0 + xref_label: "something", + xref_count: 0 } ] expect(subject.xrefs).to eq({ broader: [], - narrower: [] + narrower: [], + see_instead: [] }) end end @@ -167,7 +183,8 @@ it "returns hash of xrefs with empty arrays" do expect(subject.xrefs).to eq({ broader: [], - narrower: [] + narrower: [], + see_instead: [] }) end end diff --git a/spec/authority_browse/subjects_spec.rb b/spec/authority_browse/subjects_spec.rb index 6c8780f1..c48ef9b8 100644 --- a/spec/authority_browse/subjects_spec.rb +++ b/spec/authority_browse/subjects_spec.rb @@ -78,7 +78,6 @@ expect(file_contents).to eq([ { id: "first\u001fsubject", - loc_id: "id1", browse_field: "subject", term: "First", count: 1, @@ -88,7 +87,6 @@ }.to_json + "\n", { id: "second\u001fsubject", - loc_id: "id2", browse_field: "subject", term: "Second", count: 2, @@ -96,7 +94,6 
@@ }.to_json + "\n", { id: "third\u001fsubject", - loc_id: "id3", browse_field: "subject", term: "Third", count: 3, @@ -145,7 +142,25 @@ ]) end end + context ".incorporate_remediated_subjects" do + it "handles adding remediated_subjects" do + mms_id = "98187481368106381" + loc_id = "http://id.loc.gov/authorities/subjects/sh2008104250" + subjects = AuthorityBrowse.db[:subjects] + subxref = AuthorityBrowse.db[:subjects_xrefs] + subjects.insert(id: loc_id, label: "Illegal Aliens", match_text: "illegal aliens", count: 0) + + AuthorityBrowse::Subjects.incorporate_remediated_subjects(File.join(S.project_root, "spec", "fixtures", "remediated_subjects.xml")) + + remediated = subjects.where(id: mms_id).first + expect(remediated[:label]).to eq("Undocumented immigrants") + expect(remediated[:match_text]).to eq("undocumented immigrants") + expect(subxref.where(subject_id: loc_id).first[:xref_kind]).to eq("see_instead") + end + end after(:each) do - %x(if [ ! -z `ls /app/tmp/` ]; then rm tmp/*; fi) + Dir["#{S.project_root}/tmp/*"].each do |file| + File.delete(file) + end end end diff --git a/spec/fixtures/remediated_subject.xml b/spec/fixtures/remediated_subject.xml new file mode 100644 index 00000000..e76e9ece --- /dev/null +++ b/spec/fixtures/remediated_subject.xml @@ -0,0 +1,55 @@ + + + 01070nz a2200205n 4500 + 20230329125549.0 + 110701i| anannbabn |a ana + 98187481368406381 + + sh2011003293 + + + (DLC)sh2011003293 + + + (LIBRARY_OF_CONGRESS)98176413560000041 + + + DLC + eng + DLC + MiU + + + Children of undocumented immigrants + Education + Law and legislation + + + Children of undocumented foreign nationals + Education + Law and legislation + + + Children of illegal aliens + Education + Law and legislation + + + g + Educational law and legislation + + + Children of noncitizens + Education + Law and legislation + + + Work cat.: 2011028194: Olivas, Michael A. No undocumented child left behind, c2011. 
+ + + a sla-lab updated based on DEIA Catalog Working Group changes May 2021; "Undocumented immigrants" term borrowed from Sears ; "undocumented foreign national" term from Bill H.R. 3776 (116th Congress) + + + sla-lab updated to include 550s for LCSH headings as references March 2023 + + diff --git a/spec/fixtures/remediated_subjects.xml b/spec/fixtures/remediated_subjects.xml new file mode 100644 index 00000000..7e91ad0e --- /dev/null +++ b/spec/fixtures/remediated_subjects.xml @@ -0,0 +1,2 @@ +01243cz a2200277n 450020230329132316.0020403i| anannbabn |a ana c98187481368006381sh 97006426(DLC) sh 97006426(LIBRARY_OF_CONGRESS)98170039530000041MnMHCLDLCWaElCMiUImmigrant detention centersAlien detention centersDetention centers, AlienDetention centers, NoncitizenDetention centers, ImmigrationImmigration detention centersUndocumented immigrant detention centersgDetention of personsNoncitizen detention centersIllegal immigrationLC database, Sept. 8, 1997(Oakdale Federal Alien Detention Center)WWW, Sept. 8, 1997(alien detention centers)sla-lab updated based on DEIA Catalog Working Group changes May 2021; "Undocumented immigrants" term borrowed from Sears ; "undocumented foreign national" term from Bill H.R. 
3776 (116th Congress); "noncitizen" term from Bill H.R.1177 (117th Congress)sla-lab updated to include 550s for LCSH headings as references March 2023 +01200cz a2200301n 450020230329130030.0030627i| anannbabn |a ana 98187481368106381sh 85003553(DLC)sh 85003553(LIBRARY_OF_CONGRESS)98171057700000041DLCDLCDLCWaUUndocumented immigrantsUndocumented foreign nationalsIllegal aliensAliensLegal status, laws, etc.nneAliens, IllegalIllegal aliensLegal status, laws, etc.Illegal immigrantsUndocumented noncitizensgAliensImmigrant detention centersHuman smugglingNoncitizensIllegal immigrationWork cat.: 2007017970: Illegal immigration, 2007:eCIP data sheet (Illegal immigrants)sla-lab updated based on DEIA Catalog Working Group changes May 2021; "Undocumented immigrants" term borrowed from Sears ; "undocumented foreign national" term from Bill H.R. 3776 (116th Congress)sla-lab updated to include 550s for LCSH headings as references March 2023