Skip to content

Commit

Permalink
feat: incorporates remediated sh headings into db
Browse files Browse the repository at this point in the history
  • Loading branch information
niquerio committed Jul 29, 2024
1 parent 73a5ffe commit 3f97f98
Show file tree
Hide file tree
Showing 13 changed files with 438 additions and 24 deletions.
2 changes: 2 additions & 0 deletions lib/authority_browse.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
require "services"
require "concurrent"
require "alma_rest_client"
require "marc"

module AuthorityBrowse
end
Expand All @@ -20,3 +21,4 @@ module AuthorityBrowse
require "authority_browse/base"
require "authority_browse/names"
require "authority_browse/subjects"
require "authority_browse/remediated_subjects"
133 changes: 133 additions & 0 deletions lib/authority_browse/remediated_subjects.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
module AuthorityBrowse
  # Collection of remediated subject heading records read from a file that
  # holds one MARCXML record per line. Each record is wrapped in an Entry
  # that knows how to write itself and its cross references to the database.
  class RemediatedSubjects
    include Enumerable

    # @param file_path [String] path to a file with one MARCXML record per line
    def initialize(file_path = S.remediated_subjects_file)
      # Blank lines (for example a trailing empty line) contain no record to
      # parse, so they are skipped instead of being turned into entries.
      @entries = File.foreach(file_path).filter_map do |line|
        Entry.new(line) unless line.strip.empty?
      end
    end

    # Yields every Entry in file order.
    def each(&block)
      @entries.each(&block)
    end

    # One remediated authority record.
    class Entry
      # @param xml [String] MARCXML for a single record
      def initialize(xml)
        @record = MARC::XMLReader.new(StringIO.new(xml)).first
      end

      # @return [String] value of the record's 001 field
      def id
        @record["001"].value
      end

      # @return [Term::Preferred] term built from the record's 150 field
      def preferred_term
        @preferred_term ||= Term::Preferred.new(@record["150"])
      end

      # Cross references built from the 450 and 550 fields. A field that
      # none of the Term kinds recognizes (e.g. a 550 whose $w is neither
      # "g" nor "h") is dropped.
      #
      # @return [Array<Term>]
      def xrefs
        @record.fields(["450", "550"]).filter_map do |field|
          term_class = [Term::SeeInstead, Term::Broader, Term::Narrower].find do |kind|
            kind.match?(field)
          end
          term_class&.new(field)
        end
      end

      # Writes the preferred term and then every cross reference to the db.
      def add_to_db
        preferred_term.add_to_db(id)
        xrefs.each do |xref|
          xref.add_to_db(id)
        end
      end
    end

    # A heading taken from a single MARC field of a remediated record.
    class Term
      # Subfields whose values are joined to form the heading label.
      LABEL_SUBFIELD_CODES = %w[a v x y z].freeze

      # @param field [MARC::DataField] field the term is built from
      def initialize(field)
        @field = field
      end

      # Kind of cross reference; supplied by the xref subclasses.
      def kind
        raise NotImplementedError
      end

      # Makes sure a :subjects row exists for this term. A row is inserted,
      # keyed by the normalized match text, only when no row with the same
      # match text is present yet. Checking for the row itself (rather than
      # comparing id to match_text) keeps a term referenced by several
      # records from being inserted more than once.
      #
      # @param preferred_term_id [String] id of the record the term belongs
      #   to; unused here, consumed by the subclasses that call super
      def add_to_db(preferred_term_id)
        return if existing_subject

        AuthorityBrowse.db[:subjects].insert(id: match_text, label: label, match_text: match_text, deprecated: false)
      end

      # @return [String] values of the label subfields joined with "--"
      def label
        @label ||= @field.subfields
          .filter_map { |subfield| subfield.value if LABEL_SUBFIELD_CODES.include?(subfield.code) }
          .join("--")
      end

      # @return [String] label normalized by AuthorityBrowse::Normalize
      def match_text
        @match_text ||= AuthorityBrowse::Normalize.match_text(label)
      end

      # @return [String] id of the :subjects row with the same match text,
      #   or the match text itself when there is no such row
      def id
        existing_subject&.dig(:id) || match_text
      end

      private

      # First :subjects row whose match_text equals this term's, or nil.
      # Not memoized because add_to_db changes the answer.
      def existing_subject
        AuthorityBrowse.db[:subjects]&.first(match_text: match_text)
      end

      # The record's own heading (field 150); stored under the record id.
      class Preferred < Term
        # @param id [String] id to store the heading under
        def add_to_db(id)
          AuthorityBrowse.db[:subjects].insert(id: id, label: label, match_text: match_text, deprecated: false)
        end
      end

      # A 450 field: a heading that points to the preferred term instead.
      class SeeInstead < Term
        def self.match?(field)
          field.tag == "450"
        end

        def kind
          "see_instead"
        end

        # Ensures the term has a subject row, then links it to the
        # preferred term.
        def add_to_db(preferred_term_id)
          super
          xrefs = AuthorityBrowse.db[:subjects_xrefs]
          xrefs.insert(subject_id: id, xref_id: preferred_term_id, xref_kind: kind)
        end
      end

      # A 550 field with $w "g".
      class Broader < Term
        def self.match?(field)
          field.tag == "550" && field["w"] == "g"
        end

        def kind
          "broader"
        end

        # Ensures the term has a subject row, then records the relation in
        # both directions: preferred term -> this term as "broader" and
        # this term -> preferred term as "narrower".
        def add_to_db(preferred_term_id)
          super
          xrefs = AuthorityBrowse.db[:subjects_xrefs]
          xrefs.insert(subject_id: preferred_term_id, xref_id: id, xref_kind: kind)
          xrefs.insert(subject_id: id, xref_id: preferred_term_id, xref_kind: "narrower")
        end
      end

      # A 550 field with $w "h".
      class Narrower < Term
        def self.match?(field)
          field.tag == "550" && field["w"] == "h"
        end

        def kind
          "narrower"
        end

        # Ensures the term has a subject row, then records the relation in
        # both directions: preferred term -> this term as "narrower" and
        # this term -> preferred term as "broader".
        def add_to_db(preferred_term_id)
          super
          xrefs = AuthorityBrowse.db[:subjects_xrefs]
          xrefs.insert(subject_id: preferred_term_id, xref_id: id, xref_kind: kind)
          xrefs.insert(subject_id: id, xref_id: preferred_term_id, xref_kind: "broader")
        end
      end
    end
  end
end
8 changes: 4 additions & 4 deletions lib/authority_browse/solr_document/authority_graph.rb
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ def term

# Library of Congress ID
#
# @return [String]
# @return [String] if the id is a valid one
def loc_id
first[:id]
first[:id] if first[:id]&.match?("loc.gov")
end

# @return [Integer]
Expand All @@ -52,10 +52,10 @@ def xrefs
@xrefs.map do |xref|
[
xref, @data.filter_map do |x|
xref_count = x[:xref_count]
xref_count = x[:xref_count].to_i
output = "#{x[:xref_label]}||#{xref_count}"
if @kind.to_s == "name"
output unless xref_count.nil? || xref_count == 0
output unless xref_count == 0
elsif x[:xref_kind] == xref.to_s
output
end
Expand Down
2 changes: 1 addition & 1 deletion lib/authority_browse/solr_document/subjects.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def self.kind

# @return [Array<Symbol>] List of kinds of xrefs for subjects
def self.xrefs
[:broader, :narrower]
[:broader, :narrower, :see_instead]
end

class AuthorityGraphSolrDocument < Subjects
Expand Down
9 changes: 9 additions & 0 deletions lib/authority_browse/subjects.rb
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,15 @@ def reset_db(loc_file_getter = lambda { fetch_skos_file })
end
end

# Loads the remediated subject headings found in +file_path+ and writes all
# of them to the database inside a single transaction.
#
# @param file_path [String] path to the remediated subjects file
def incorporate_remediated_subjects(file_path = S.remediated_subjects_file)
  remediated = AuthorityBrowse::RemediatedSubjects.new(file_path)
  AuthorityBrowse.db.transaction { remediated.each(&:add_to_db) }
end

# Loads solr with documents of subjects that match data from Library of
# Congress.
# @param solr_uploader [Solr::Uploader]
Expand Down
7 changes: 6 additions & 1 deletion lib/browse.rb
Original file line number Diff line number Diff line change
Expand Up @@ -103,11 +103,16 @@ class Subjects < Thor
desc "reset_db", "resets subjects skos tables"
long_desc <<~DESC
Downloads the latest version of the skosrdf data for subjects from the
Library of Congress. Reloads the tables :subjects and :names_see_also with
Library of Congress. Reloads the tables :subjects and :subjects_see_also with
the new data. Gets rid of duplicate deprecated subjects.
Incorporates remediated subject headings.
DESC
# Thor command body: reloads the subjects tables and then adds the
# remediated subject headings, logging before and after each phase.
def reset_db
S.logger.info "Loading Subjects from Library of Congress"
AuthorityBrowse::Subjects.reset_db
S.logger.info "Loading remediated subjects"
# NOTE(review): runs after the reset, presumably so remediated terms are
# added on top of the freshly loaded data — confirm the ordering requirement.
AuthorityBrowse::Subjects.incorporate_remediated_subjects
S.logger.info "Finished Loading Subjects"
end

desc "update", "updates subjects tables with counts from biblio"
Expand Down
1 change: 1 addition & 0 deletions solr/authority_browse/conf/managed-schema
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
<field name="alternate_forms" type="string" stored="true" indexed="false" multiValued="true"/>
<field name="see_also" type="string" indexed="true" stored="true" multiValued="true" />
<field name="incoming_see_also" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="see_instead" type="string" indexed="true" stored="true" multiValued="true" />
<field name="broader" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="narrower" type="string" indexed="true" stored="true" multiValued="true"/>

Expand Down
3 changes: 0 additions & 3 deletions spec/authority_browse/names_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@
expect(file_contents).to eq([
{
id: "first\u001fname",
loc_id: "id1",
browse_field: "name",
term: "First",
count: 1,
Expand All @@ -71,15 +70,13 @@
}.to_json + "\n",
{
id: "second\u001fname",
loc_id: "id2",
browse_field: "name",
term: "Second",
count: 2,
date_of_index: Date.today.strftime("%Y-%m-%d") + "T00:00:00Z"
}.to_json + "\n",
{
id: "third\u001fname",
loc_id: "id3",
browse_field: "name",
term: "Third",
count: 3,
Expand Down
Loading

0 comments on commit 3f97f98

Please sign in to comment.