From 26a77348f4ac33c75a8216e523c984b74da406df Mon Sep 17 00:00:00 2001 From: TobiasNx Date: Wed, 29 May 2024 08:58:33 +0200 Subject: [PATCH 1/6] Moving set_array for id does not change the order --- Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix b/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix index d286aab..6ee6302 100644 --- a/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix +++ b/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix @@ -1,3 +1,4 @@ +set_array("id") set_array("rvk[]") do list(path: "084??", "var": "$i") @@ -6,7 +7,6 @@ do list(path: "084??", "var": "$i") end end -set_array("id") do list(path: "035??", "var": "$i") if any_match("$i.a", "^\\(DE-605\\)(.*)") copy_field("$i.a","id.$append") From 1feababb20f1b56aa50aa381fe7dcfc2d09130ab Mon Sep 17 00:00:00 2001 From: TobiasNx Date: Wed, 29 May 2024 09:00:10 +0200 Subject: [PATCH 2/6] Renaming the element does not change the order --- Concordance-RVK-Verbundbibliothek/bulk.ndjson | 2 +- Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Concordance-RVK-Verbundbibliothek/bulk.ndjson b/Concordance-RVK-Verbundbibliothek/bulk.ndjson index 82067bb..17571ae 100644 --- a/Concordance-RVK-Verbundbibliothek/bulk.ndjson +++ b/Concordance-RVK-Verbundbibliothek/bulk.ndjson @@ -1,2 +1,2 @@ {"index":{"_index":"cgrvk","_type":"rvk"}} -{"rvk":["CI 5310","CI 5603","CI 1100","CI 1125","CI 5603","CI 5604","EC 2430","IH 34381"],"id":"HT013166356, HT018625006, TT000577460"} +{"rvk":["CI 5310","CI 5603","CI 1100","CI 1125","CI 5603","CI 5604","EC 2430","IH 34381"],"@id":"HT013166356, HT018625006, TT000577460"} diff --git a/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix b/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix index 6ee6302..8d00503 100644 --- a/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix +++ 
b/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix @@ -1,4 +1,4 @@ -set_array("id") +set_array("@id") set_array("rvk[]") do list(path: "084??", "var": "$i") @@ -9,13 +9,13 @@ end do list(path: "035??", "var": "$i") if any_match("$i.a", "^\\(DE-605\\)(.*)") - copy_field("$i.a","id.$append") + copy_field("$i.a","@id.$append") end end -replace_all("id.*","^\\(DE-605\\)(.*)","$1") -join_field("id",", ") +replace_all("@id.*","^\\(DE-605\\)(.*)","$1") +join_field("@id",", ") -retain("rvk[]","id") +retain("rvk[]","@id") vacuum() # Filter records without RVK @@ -23,8 +23,8 @@ unless exists("rvk[]") reject() end -# Filter records without hbz ids -unless exists("id") +# Filter records without hbz @ids +unless exists("@id") reject() end From 1bedc2cb0930bee9a9b89229ec3529cd70e77bd7 Mon Sep 17 00:00:00 2001 From: TobiasNx Date: Wed, 29 May 2024 09:02:52 +0200 Subject: [PATCH 3/6] Using a helper element does not do the trick Thought the change with `join_field` from array to string would cause the order problem but it does not. 
--- Concordance-RVK-Verbundbibliothek/bulk.ndjson | 2 +- Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Concordance-RVK-Verbundbibliothek/bulk.ndjson b/Concordance-RVK-Verbundbibliothek/bulk.ndjson index 17571ae..82067bb 100644 --- a/Concordance-RVK-Verbundbibliothek/bulk.ndjson +++ b/Concordance-RVK-Verbundbibliothek/bulk.ndjson @@ -1,2 +1,2 @@ {"index":{"_index":"cgrvk","_type":"rvk"}} -{"rvk":["CI 5310","CI 5603","CI 1100","CI 1125","CI 5603","CI 5604","EC 2430","IH 34381"],"@id":"HT013166356, HT018625006, TT000577460"} +{"rvk":["CI 5310","CI 5603","CI 1100","CI 1125","CI 5603","CI 5604","EC 2430","IH 34381"],"id":"HT013166356, HT018625006, TT000577460"} diff --git a/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix b/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix index 8d00503..96607cd 100644 --- a/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix +++ b/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix @@ -1,3 +1,4 @@ +add_field("id","") set_array("@id") set_array("rvk[]") @@ -14,8 +15,9 @@ do list(path: "035??", "var": "$i") end replace_all("@id.*","^\\(DE-605\\)(.*)","$1") join_field("@id",", ") +move_field("@id","id") -retain("rvk[]","@id") +retain("rvk[]","id") vacuum() # Filter records without RVK @@ -24,7 +26,7 @@ unless exists("rvk[]") end # Filter records without hbz @ids -unless exists("@id") +unless exists("id") reject() end From 239469d6bfe09541c716f9961f446b982fdec982 Mon Sep 17 00:00:00 2001 From: TobiasNx Date: Wed, 29 May 2024 10:09:42 +0200 Subject: [PATCH 4/6] Add workflow for csv --- Concordance-RVK-Verbundbibliothek/bulk.csv | 1 + Concordance-RVK-Verbundbibliothek/bulk.json | 1 + ...Rvk-Verbundbibliothek_concordance_csv.flux | 27 +++++++++++++++++++ .../fix-cg-to-es.fix | 13 +++++---- 4 files changed, 35 insertions(+), 7 deletions(-) create mode 100644 Concordance-RVK-Verbundbibliothek/bulk.csv create mode 100644 Concordance-RVK-Verbundbibliothek/bulk.json 
create mode 100644 Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_csv.flux diff --git a/Concordance-RVK-Verbundbibliothek/bulk.csv b/Concordance-RVK-Verbundbibliothek/bulk.csv new file mode 100644 index 0000000..4a10c81 --- /dev/null +++ b/Concordance-RVK-Verbundbibliothek/bulk.csv @@ -0,0 +1 @@ +"HT013166356, HT018625006, TT000577460","CI 5310","CI 5603","CI 1100","CI 1125","CI 5603","CI 5604","EC 2430","IH 34381" diff --git a/Concordance-RVK-Verbundbibliothek/bulk.json b/Concordance-RVK-Verbundbibliothek/bulk.json new file mode 100644 index 0000000..4a10c81 --- /dev/null +++ b/Concordance-RVK-Verbundbibliothek/bulk.json @@ -0,0 +1 @@ +"HT013166356, HT018625006, TT000577460","CI 5310","CI 5603","CI 1100","CI 1125","CI 5603","CI 5604","EC 2430","IH 34381" diff --git a/Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_csv.flux b/Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_csv.flux new file mode 100644 index 0000000..ca4c6de --- /dev/null +++ b/Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_csv.flux @@ -0,0 +1,27 @@ +// Die flux filtert mittels morph-cg-to-es.xml die Isil DE-605 aus +// This flux uses morph-cg-to-es.xml to filter records with holdings +// by Isil DE-605 from culturegraph aggregate marcxml. It then builds +// a concordance Id<->RVK which can be indexed directly into elasticsearch. 
+// Snippet from the output json: +// +//{"index":{"_index":"cgrvk","_type":"rvk"}} +//{"rvk":["CI 1100","5,1"],"hbzId":"HT018839495, HT018625006"} +// +// Use curl to bulk load the file: +// +// curl -XPOST --header 'Content-Type: application/x-ndjson' -d @bulk.ndjson 'http://localhost:9200/_bulk' + +default outfile = FLUX_DIR + "bulk.json"; +default infile = FLUX_DIR + "aggregate_auslieferung_20191212.small.marcxml.gz"; +default fixfile = FLUX_DIR + "fix-cg-to-es.fix"; + + +infile +| open-file +| decode-xml +| handle-marcxml +| fix(fixfile) +| encode-csv +//encode-json +| write(outfile) +; \ No newline at end of file diff --git a/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix b/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix index 96607cd..c21b900 100644 --- a/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix +++ b/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix @@ -1,5 +1,4 @@ -add_field("id","") -set_array("@id") +set_array("id") set_array("rvk[]") do list(path: "084??", "var": "$i") @@ -8,14 +7,14 @@ do list(path: "084??", "var": "$i") end end + do list(path: "035??", "var": "$i") if any_match("$i.a", "^\\(DE-605\\)(.*)") - copy_field("$i.a","@id.$append") + copy_field("$i.a","id.$append") end end -replace_all("@id.*","^\\(DE-605\\)(.*)","$1") -join_field("@id",", ") -move_field("@id","id") +replace_all("id.*","^\\(DE-605\\)(.*)","$1") +join_field("id",", ") retain("rvk[]","id") vacuum() @@ -25,7 +24,7 @@ unless exists("rvk[]") reject() end -# Filter records without hbz @ids +# Filter records without hbz ids unless exists("id") reject() end From 06c1955c070c2b092c052cfbaff99e7a34201c9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20B=C3=BClte?= Date: Mon, 3 Jun 2024 13:46:40 +0200 Subject: [PATCH 5/6] Keep single records for every id Created objects under the path `records` that have different ids but the same rvk elements, encoded them as JSON and decoded them again with `records` as the record path. 
By that I am able to create a single record for each id. --- Concordance-RVK-Verbundbibliothek/bulk.csv | 4 ++- ...Rvk-Verbundbibliothek_concordance_csv.flux | 5 ++-- .../fix-cg-to-es.fix | 27 ++++++++++++------- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/Concordance-RVK-Verbundbibliothek/bulk.csv b/Concordance-RVK-Verbundbibliothek/bulk.csv index 4a10c81..7a107db 100644 --- a/Concordance-RVK-Verbundbibliothek/bulk.csv +++ b/Concordance-RVK-Verbundbibliothek/bulk.csv @@ -1 +1,3 @@ -"HT013166356, HT018625006, TT000577460","CI 5310","CI 5603","CI 1100","CI 1125","CI 5603","CI 5604","EC 2430","IH 34381" +"HT013166356","CI 5310","CI 5603","CI 1100","CI 1125","CI 5604","EC 2430","IH 34381" +"HT018625006","CI 5310","CI 5603","CI 1100","CI 1125","CI 5604","EC 2430","IH 34381" +"TT000577460","CI 5310","CI 5603","CI 1100","CI 1125","CI 5604","EC 2430","IH 34381" diff --git a/Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_csv.flux b/Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_csv.flux index ca4c6de..20d1cd2 100644 --- a/Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_csv.flux +++ b/Concordance-RVK-Verbundbibliothek/culturegraph_to_Rvk-Verbundbibliothek_concordance_csv.flux @@ -11,7 +11,7 @@ // // curl -XPOST --header 'Content-Type: application/x-ndjson' -d @bulk.ndjson 'http://localhost:9200/_bulk' -default outfile = FLUX_DIR + "bulk.json"; +default outfile = FLUX_DIR + "bulk.csv"; default infile = FLUX_DIR + "aggregate_auslieferung_20191212.small.marcxml.gz"; default fixfile = FLUX_DIR + "fix-cg-to-es.fix"; @@ -21,7 +21,8 @@ infile | decode-xml | handle-marcxml | fix(fixfile) +| encode-json +| decode-json(recordPath="records") | encode-csv -//encode-json | write(outfile) ; \ No newline at end of file diff --git a/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix b/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix index c21b900..dc881db 
100644 --- a/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix +++ b/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix @@ -1,34 +1,41 @@ -set_array("id") -set_array("rvk[]") +set_array("records[]") +set_array("@id[]") +set_array("@rvk[]") do list(path: "084??", "var": "$i") if any_match("$i.2", "rvk") - copy_field("$i.a","rvk[].$append") + copy_field("$i.a","@rvk[].$append") end end +uniq("@rvk[]") + do list(path: "035??", "var": "$i") if any_match("$i.a", "^\\(DE-605\\)(.*)") - copy_field("$i.a","id.$append") + copy_field("$i.a","@id[].$append") end end -replace_all("id.*","^\\(DE-605\\)(.*)","$1") -join_field("id",", ") +replace_all("id[].*","^\\(DE-605\\)(.*)","$1") + +do list(path: "@id[]", "var": "$i") + copy_field("$i","records[].$append.id") + copy_field("@rvk[]","records[].$last.rvk[]") +end +replace_all("records[].*.id","^\\(DE-605\\)(.*)","$1") -retain("rvk[]","id") vacuum() # Filter records without RVK -unless exists("rvk[]") +unless exists("@rvk[]") reject() end # Filter records without hbz ids -unless exists("id") +unless exists("@id[]") reject() end - +retain("records[]") From 6fa20b85390c93c48702ddb07200268a39ee2900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20B=C3=BClte?= Date: Mon, 3 Jun 2024 13:51:46 +0200 Subject: [PATCH 6/6] Join rvk array to one element --- Concordance-RVK-Verbundbibliothek/bulk.csv | 6 +++--- Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix | 11 ++++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/Concordance-RVK-Verbundbibliothek/bulk.csv b/Concordance-RVK-Verbundbibliothek/bulk.csv index 7a107db..cab0361 100644 --- a/Concordance-RVK-Verbundbibliothek/bulk.csv +++ b/Concordance-RVK-Verbundbibliothek/bulk.csv @@ -1,3 +1,3 @@ -"HT013166356","CI 5310","CI 5603","CI 1100","CI 1125","CI 5604","EC 2430","IH 34381" -"HT018625006","CI 5310","CI 5603","CI 1100","CI 1125","CI 5604","EC 2430","IH 34381" -"TT000577460","CI 5310","CI 5603","CI 1100","CI 1125","CI 5604","EC 2430","IH 34381" +"HT013166356","CI 
5310,CI 5603,CI 1100,CI 1125,CI 5604,EC 2430,IH 34381" +"HT018625006","CI 5310,CI 5603,CI 1100,CI 1125,CI 5604,EC 2430,IH 34381" +"TT000577460","CI 5310,CI 5603,CI 1100,CI 1125,CI 5604,EC 2430,IH 34381" diff --git a/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix b/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix index dc881db..9083556 100644 --- a/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix +++ b/Concordance-RVK-Verbundbibliothek/fix-cg-to-es.fix @@ -1,14 +1,15 @@ set_array("records[]") set_array("@id[]") -set_array("@rvk[]") +set_array("@rvk") do list(path: "084??", "var": "$i") if any_match("$i.2", "rvk") - copy_field("$i.a","@rvk[].$append") + copy_field("$i.a","@rvk.$append") end end -uniq("@rvk[]") +uniq("@rvk") +join_field("@rvk",",") do list(path: "035??", "var": "$i") @@ -20,14 +21,14 @@ replace_all("id[].*","^\\(DE-605\\)(.*)","$1") do list(path: "@id[]", "var": "$i") copy_field("$i","records[].$append.id") - copy_field("@rvk[]","records[].$last.rvk[]") + copy_field("@rvk","records[].$last.rvk") end replace_all("records[].*.id","^\\(DE-605\\)(.*)","$1") vacuum() # Filter records without RVK -unless exists("@rvk[]") +unless exists("@rvk") reject() end