diff --git a/Concordance-RVK-Verbundbibliothek/bulk.csv b/Concordance-RVK-Verbundbibliothek/bulk.csv
new file mode 100644
index 0000000..cab0361
--- /dev/null
+++ b/Concordance-RVK-Verbundbibliothek/bulk.csv
@@ -0,0 +1,3 @@
+"HT013166356","CI 5310,CI 5603,CI 1100,CI 1125,CI 5604,EC 2430,IH 34381"
+"HT018625006","CI 5310,CI 5603,CI 1100,CI 1125,CI 5604,EC 2430,IH 34381"
+"TT000577460","CI 5310,CI 5603,CI 1100,CI 1125,CI 5604,EC 2430,IH 34381"
diff --git a/Concordance-RVK-Verbundbibliothek/bulk.json b/Concordance-RVK-Verbundbibliothek/bulk.json
new file mode 100644
index 0000000..4a10c81
--- /dev/null
+++ b/Concordance-RVK-Verbundbibliothek/bulk.json
@@ -0,0 +1 @@
+"HT013166356, HT018625006, TT000577460","CI 5310","CI 5603","CI 1100","CI 1125","CI 5603","CI 5604","EC 2430","IH 34381"
diff --git a/Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_csv.flux b/Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_csv.flux
new file mode 100644
index 0000000..20d1cd2
--- /dev/null
+++ b/Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_csv.flux
@@ -0,0 +1,28 @@
+// This flux uses fix-cg-to-es.fix to filter records with holdings
+// by ISIL DE-605 from the culturegraph aggregate MARCXML. It then
+// builds a concordance hbz ID <-> RVK which can be indexed
+// directly into elasticsearch.
+// Snippet from the output json:
+//
+//{"index":{"_index":"cgrvk","_type":"rvk"}}
+//{"rvk":["CI 1100","5,1"],"hbzId":"HT018839495, HT018625006"}
+//
+// Use curl to bulk load the file:
+//
+// curl -XPOST --header 'Content-Type: application/x-ndjson' -d @bulk.ndjson 'http://localhost:9200/_bulk'
+
+default outfile = FLUX_DIR + "bulk.csv";
+default infile = FLUX_DIR + "aggregate_auslieferung_20191212.small.marcxml.gz";
+default fixfile = FLUX_DIR + "fix-cg-to-es.fix";
+
+
+infile
+| open-file
+| decode-xml
+| handle-marcxml
+| fix(fixfile)
+| encode-json
+| decode-json(recordPath="records")
+| encode-csv
+| write(outfile)
+;
\ No newline at end of file
diff --git a/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix b/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix
index d286aab..9083556 100644
--- a/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix
+++ b/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix
@@ -1,33 +1,42 @@
-set_array("rvk[]")
+set_array("records[]")
+set_array("@id[]")
+set_array("@rvk")
 
 do list(path: "084??", "var": "$i")
   if any_match("$i.2", "rvk")
-    copy_field("$i.a","rvk[].$append")
+    copy_field("$i.a","@rvk.$append")
   end
 end
 
-set_array("id")
+uniq("@rvk")
+join_field("@rvk",",")
+
+
 do list(path: "035??", "var": "$i")
   if any_match("$i.a", "^\\(DE-605\\)(.*)")
-    copy_field("$i.a","id.$append")
+    copy_field("$i.a","@id[].$append")
   end
 end
 
-replace_all("id.*","^\\(DE-605\\)(.*)","$1")
-join_field("id",", ")
+replace_all("@id[].*","^\\(DE-605\\)(.*)","$1")
+
+do list(path: "@id[]", "var": "$i")
+  copy_field("$i","records[].$append.id")
+  copy_field("@rvk","records[].$last.rvk")
+end
+replace_all("records[].*.id","^\\(DE-605\\)(.*)","$1")
 
-retain("rvk[]","id")
 vacuum()
 
 # Filter records without RVK
-unless exists("rvk[]")
+unless exists("@rvk")
   reject()
 end
 
 # Filter records without hbz ids
-unless exists("id")
+unless exists("@id[]")
   reject()
 end
-
+retain("records[]")
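Usage sketch for the new flux (assumptions: a local metafacture-core distribution whose flux.sh is on the PATH, and an illustrative input path; neither is part of this change). The pipeline writes one CSV row per hbz id with the comma-joined RVK notations, as in bulk.csv above:

  # run the flux; the defaults declared in the flux file can be overridden as name=value pairs
  flux.sh Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_csv.flux \
      infile=path/to/aggregate_auslieferung.marcxml.gz

  # the outfile default resolves relative to FLUX_DIR, i.e. next to the flux file
  head Concordance-RVK-Verbundbibliothek/bulk.csv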