diff --git a/pom.xml b/pom.xml
index 5c159e1..f7a3d64 100644
--- a/pom.xml
+++ b/pom.xml
@@ -18,6 +18,7 @@
UTF-8
+ UTF-8
1.8
1.8
@@ -31,7 +32,7 @@
org.junit.jupiter
junit-jupiter
- 5.7.0
+ 5.8.2
test
@@ -53,10 +54,9 @@
commons-cli
commons-cli
- 1.4
+ 1.5.0
compile
-
-
- org.acoli.fintan
- fintan-core
- 0.0.1-SNAPSHOT
-
org.apache.commons
commons-lang3
- 3.4
+ 3.12.0
com.fasterxml.jackson.core
jackson-databind
- 2.12.3
+ 2.13.1
uk.org.webcompere
@@ -91,18 +85,6 @@
-
-
-
- org.apache.logging.log4j
- log4j-bom
- 2.16.0
- import
- pom
-
-
-
-
@@ -125,6 +107,43 @@
single
+
+
+
+
+ org.jacoco
+ jacoco-maven-plugin
+ 0.8.7
+
+
+
+ prepare-agent
+
+
+
+ generate-code-coverage-report
+ test
+
+ report
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+ 3.0.0-M5
+
+
+ org.apache.maven.plugins
+ maven-failsafe-plugin
+ 3.0.0-M5
+
+
+
+ integration-test
+ verify
+
diff --git a/src/main/java/org/acoli/conll/rdf/ANSI.java b/src/main/java/org/acoli/conll/rdf/ANSI.java
new file mode 100644
index 0000000..7ef28e0
--- /dev/null
+++ b/src/main/java/org/acoli/conll/rdf/ANSI.java
@@ -0,0 +1,26 @@
+package org.acoli.conll.rdf;
+
+public enum ANSI {
+ ;
+
+ public static final String RESET = "\u001B[0m";
+ public static final String BRIGHTER = "\u001B[1m";
+ public static final String ULINE = "\u001B[4m";
+ public static final String FLASH = "\u001B[5m";
+ public static final String BLACK = "\u001B[30m";
+ public static final String RED = "\u001B[31m";
+ public static final String GREEN = "\u001B[32m";
+ public static final String YELLOW = "\u001B[33m";
+ public static final String BLUE = "\u001B[34m";
+ public static final String PURPLE = "\u001B[35m";
+ public static final String CYAN = "\u001B[36m";
+ public static final String WHITE = "\u001B[37m";
+ public static final String BLACK_BK = "\u001B[40m";
+ public static final String RED_BK = "\u001B[41m";
+ public static final String GREEN_BK = "\u001B[42m";
+ public static final String YLW_BK = "\u001B[43m";
+ public static final String BLUE_BK = "\u001B[44m";
+ public static final String PPL_BK = "\u001B[45m";
+ public static final String CYAN_BK = "\u001B[46m";
+ public static final String WHITE_BK = "\u001B[47m";
+}
diff --git a/src/main/java/org/acoli/conll/rdf/CoNLLRDFAnnotator.java b/src/main/java/org/acoli/conll/rdf/CoNLLRDFAnnotator.java
index 4bb3a7b..31e8d00 100644
--- a/src/main/java/org/acoli/conll/rdf/CoNLLRDFAnnotator.java
+++ b/src/main/java/org/acoli/conll/rdf/CoNLLRDFAnnotator.java
@@ -67,19 +67,19 @@ public static void main(String[] args) throws IOException {
while(!command.trim().equals(">")) {
System.err.print(
"actions ............................................................................................................\n"+
- " : "+ANSI_BLUE+"$nr/$att=$val"+ANSI_RESET+" for element number $nr, set CoNLL property $att to $val, e.g., \"1/POS=NOUN\" :\n"+
+ " : "+ANSI.BLUE+"$nr/$att=$val"+ANSI.RESET+" for element number $nr, set CoNLL property $att to $val, e.g., \"1/POS=NOUN\" :\n"+
//" : $nr element number (starting with 1), e.g., 1 for the first :\n"+
//" : $att local name of a CoNLL property, e.g., POS :\n"+
//" : $val string value of the CoNLL property, e.g., NOUN :\n"+
" : for HEAD, enter the number of the head node, will be expanded to URI :\n"+
- " : "+ANSI_BLUE+"$nr/$p1[/$p2..]"+ANSI_RESET+" multiple $att=$val patterns $p1, $p2, ... for $nr can be provided as ,-separated list :\n"+
+ " : "+ANSI.BLUE+"$nr/$p1[/$p2..]"+ANSI.RESET+" multiple $att=$val patterns $p1, $p2, ... for $nr can be provided as ,-separated list :\n"+
" : e.g., \"1/HEAD=0/EDGE=root\"; NOTE: $val must not contain / :\n"+
- " : "+ANSI_BLUE+">"+ANSI_RESET+" write and go to next sentence :\n"+
- " : "+ANSI_BLUE+"m"+ANSI_RESET+" define or undefine a macro (a regex for preprocessing your input) :\n"+
- " : "+ANSI_BLUE+"+C"+ANSI_RESET+" quit :\n"+
+ " : "+ANSI.BLUE+">"+ANSI.RESET+" write and go to next sentence :\n"+
+ " : "+ANSI.BLUE+"m"+ANSI.RESET+" define or undefine a macro (a regex for preprocessing your input) :\n"+
+ " : "+ANSI.BLUE+"+C"+ANSI.RESET+" quit :\n"+
" :..........................................................................................................:\n");
if(macros.trim().length()>0)
- System.err.println("macros "+ANSI_RED+macros.replaceAll("\n",ANSI_RESET+"\n "+ANSI_RED).replaceAll("\t","\t"+ANSI_RESET+"=>\t"+ANSI_BLUE)+ANSI_RESET);
+ System.err.println("macros "+ANSI.RED+macros.replaceAll("\n",ANSI.RESET+"\n "+ANSI.RED).replaceAll("\t","\t"+ANSI.RESET+"=>\t"+ANSI.BLUE)+ANSI.RESET);
System.err.print("| ----------------------------\n| "+CoNLLRDFFormatter.extractCoNLLGraph(buffer,true).replaceAll("\n","\n| ")+"-----------------------------\n"+
"command: ");
command=commands.readLine().trim();
@@ -107,7 +107,7 @@ public static void main(String[] args) throws IOException {
}
command = "";
}
- //System.err.println(ANSI_RED+"> "+line+ANSI_RESET);
+ //System.err.println(ANSI.RED+"> "+line+ANSI.RESET);
if(line.trim().startsWith("@") && !lastLine.trim().endsWith("."))
//System.out.print("\n");
buffer=buffer+"\n";
@@ -140,7 +140,7 @@ protected static String applyMacros(String macros, String cmd) {
cmd=cmd.replaceAll(lhs,rhs);
}
if(!cmd.equals(orig))
- System.err.println("macro expansion: "+ANSI_RED+orig+ANSI_RESET+"\t=>\t"+ANSI_BLUE+cmd+ANSI_RESET);
+ System.err.println("macro expansion: "+ANSI.RED+orig+ANSI.RESET+"\t=>\t"+ANSI.BLUE+cmd+ANSI.RESET);
return cmd;
}
diff --git a/src/main/java/org/acoli/conll/rdf/CoNLLRDFFormatter.java b/src/main/java/org/acoli/conll/rdf/CoNLLRDFFormatter.java
index 93eba59..4210e9f 100644
--- a/src/main/java/org/acoli/conll/rdf/CoNLLRDFFormatter.java
+++ b/src/main/java/org/acoli/conll/rdf/CoNLLRDFFormatter.java
@@ -1,803 +1,720 @@
-/*
- * Copyright [2017] [ACoLi Lab, Prof. Dr. Chiarcos, Goethe University Frankfurt]
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.acoli.conll.rdf;
-
-import java.io.*;
-import java.util.*;
-import org.apache.jena.rdf.model.*; // Jena 2.x
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
-import org.apache.commons.cli.ParseException;
-import org.apache.jena.query.*;
-
-
-/** reads CoNLL-RDF from stdin, writes it formatted to stdout (requires a Un*x shell)
- * this is basically for diagnostic purposes
- * @author Christian Chiarcos {@literal chiarcos@informatik.uni-frankfurt.de}
- * @author Christian Faeth {@literal faeth@em.uni-frankfurt.de}
- */
-public class CoNLLRDFFormatter extends CoNLLRDFComponent {
-
- protected static Logger LOG = LogManager.getLogger(CoNLLRDFFormatter.class.getName());
-
- public static final String ANSI_RESET = "\u001B[0m";
- public static final String ANSI_BRIGHTER = "\u001B[1m";
- public static final String ANSI_ULINE = "\u001B[4m";
- public static final String ANSI_FLASH = "\u001B[5m";
- public static final String ANSI_BLACK = "\u001B[30m";
- public static final String ANSI_RED = "\u001B[31m";
- public static final String ANSI_GREEN = "\u001B[32m";
- public static final String ANSI_YELLOW = "\u001B[33m";
- public static final String ANSI_BLUE = "\u001B[34m";
- public static final String ANSI_PURPLE = "\u001B[35m";
- public static final String ANSI_CYAN = "\u001B[36m";
- public static final String ANSI_WHITE = "\u001B[37m";
- public static final String ANSI_BLACK_BK = "\u001B[40m";
- public static final String ANSI_RED_BK = "\u001B[41m";
- public static final String ANSI_GREEN_BK = "\u001B[42m";
- public static final String ANSI_YLW_BK = "\u001B[43m";
- public static final String ANSI_BLUE_BK = "\u001B[44m";
- public static final String ANSI_PPL_BK = "\u001B[45m";
- public static final String ANSI_CYAN_BK = "\u001B[46m";
- public static final String ANSI_WHITE_BK = "\u001B[47m";
-
- public class Module {
- private Mode mode = Mode.CONLLRDF;
- private List cols = new ArrayList();
- String select = "";
- private PrintStream outputStream;
-
- public Mode getMode() {
- return mode;
- }
-
- public void setMode(Mode mode) {
- this.mode = mode;
- }
-
- public List getCols() {
- return cols;
- }
-
- public void setCols(List cols) {
- this.cols = cols;
- }
-
- public String getSelect() {
- return select;
- }
-
- public void setSelect(String select) {
- this.select = select;
- }
-
- public PrintStream getOutputStream() {
- if (outputStream != null) {
- return outputStream;
- } else {
- // Retrieve outputStream of the enclosing Formatter
- return new PrintStream(CoNLLRDFFormatter.this.getOutputStream());
- }
- }
-
- public void setOutputStream(PrintStream outputStream) {
- this.outputStream = outputStream;
- }
- }
-
- public static enum Mode {
- CONLL, CONLLRDF, DEBUG, QUERY, GRAMMAR, SEMANTICS, GRAMMAR_SEMANTICS
- }
-
- private List modules = new ArrayList();
-
- public List getModules() {
- return modules;
- }
- public Module addModule(Mode mode) {
- Module module = new Module();
- module.setMode(mode);
- modules.add(module);
- return module;
- }
-
- /** do some highlighting, but provide the full TTL data*/
- public String colorTTL(String buffer) {
- return buffer.replaceAll("(terms:[^ ]*)",ANSI_YLW_BK+"$1"+ANSI_RESET)
- .replaceAll("(rdfs:label +)(\"[^\"]*\")","$1"+ANSI_CYAN+"$2"+ANSI_RESET)
- .replaceAll("(nif:[^ ]*)",ANSI_YELLOW+"$1"+ANSI_RESET)
- .replaceAll("(conll:[^ \n]*)([^;\n]*[;]?)",ANSI_CYAN_BK+ANSI_BRIGHTER+ANSI_BLUE+"$1"+ANSI_RESET+ANSI_CYAN_BK+ANSI_BRIGHTER+"$2"+ANSI_RESET);
- }
-
- /** default: do not return type assignments */
- protected static String extractCoNLLGraph(String buffer) {
- return extractCoNLLGraph(buffer,false);
- }
-
- /** buffer must be valid turtle, produces an extra column for terms: type assignments */
- protected static String extractCoNLLGraph(String buffer, boolean includeTermConcepts) {
- Model m = null;
- try {
- m = ModelFactory.createDefaultModel().read(new StringReader(buffer),null, "TTL");
- } catch (org.apache.jena.riot.RiotException e) {
- e.printStackTrace();
- LOG.error("while reading:\n"+buffer);
- }
- Vector ids = new Vector();
- Vector words = new Vector();
- Vector annos = new Vector();
- Vector depth = new Vector();
- Vector edges = new Vector();
- Vector headDir = new Vector();
- Vector terms = new Vector();
- Integer maxDepth = 0;
- Integer maxEdgeLength = 0;
- Integer maxIdLength = 0;
- Integer maxWordLength = 0;
- Integer maxTermLength = 0;
-
- String word = null;
- try {
- word = QueryExecutionFactory.create(
- "PREFIX nif: \n"+
- "SELECT ?first WHERE { ?first a nif:Word. FILTER(NOT EXISTS{ [] nif:nextWord ?first })} LIMIT 1",
- m).execSelect().next().get("?first").toString();
- while(true) {
- ids.add(word.replaceAll(".*[\\\\/#:]", ""));
- maxIdLength=Math.max(maxIdLength, ids.get(ids.size()-1).length());
- try {
- words.add(
- QueryExecutionFactory.create(
- "PREFIX conll: \n"+
- "SELECT ?word WHERE { <"+word+"> conll:WORD ?word } LIMIT 1",
- m).execSelect().next().get("?word").toString());
- } catch (NoSuchElementException e) {
- LOG.warn("Warning: no conll:WORD (WORD column) found");
- words.add("");
- }
- maxWordLength=Math.max(maxWordLength, words.get(words.size()-1).length());
- String anno = "";
- ResultSet annos_raw = QueryExecutionFactory.create(
- "PREFIX conll: \n"+
- "SELECT ?rel ?val WHERE { <"+word+"> ?rel ?val \n"
- + "FILTER(contains(str(?rel),'http://ufal.mff.cuni.cz/conll2009-st/task-description.html#'))\n"
- + "FILTER(?rel!=conll:HEAD && ?rel!=conll:EDGE && ?rel!=conll:WORD) } ORDER BY ASC(?rel)",
- m).execSelect();
- String rel = "";
- while(annos_raw.hasNext()) {
- QuerySolution next = annos_raw.next();
- String nextRel = next.get("?rel").toString().replaceFirst(".*#","");
- if(!rel.equals(nextRel))
- anno=anno+
- ANSI_BLUE+ANSI_ULINE+
- nextRel+
- ANSI_RESET+" ";
- rel=nextRel;
- anno=anno+
- next.get("?val").toString().
- replaceFirst("^http://purl.org/acoli/open-ie/(.*)$",ANSI_YLW_BK+"$1"+ANSI_RESET).
- replaceFirst(".*#","")+
- " ";
- }
-
- // we append OLiA annotations to CoNLL annotations
- ResultSet olia_types= QueryExecutionFactory.create(
- "PREFIX conll: \n"+
- "SELECT ?concept WHERE { <"+word+"> a ?concept \n"
- + "FILTER(contains(str(?concept),'http://purl.org/olia'))\n"
- + "} ORDER BY ASC(?val)",
- m).execSelect();
- while(olia_types.hasNext())
- anno=anno+
- ANSI_RED+
- olia_types.next().get("?concept").toString().replaceFirst("^.*/([^/]*)\\.(owl|rdf)[#/]","$1:")+
- ANSI_RESET+" ";
-
- // append OLiA features
- ResultSet olia_feats= QueryExecutionFactory.create(
- "PREFIX conll: \n"+
- "SELECT ?rel ?concept WHERE { <"+word+"> ?rel ?val. ?val a ?concept.\n"
- + "FILTER(contains(str(?rel),'http://purl.org/olia'))\n"
- + "FILTER(contains(str(?concept),'http://purl.org/olia'))\n"
- + "} ORDER BY ASC(?rel)",
- m).execSelect();
- while(olia_feats.hasNext()) {
- QuerySolution next = olia_feats.next();
- anno = anno+
- ANSI_RED+ANSI_ULINE+
- next.get("?rel").toString().replaceFirst("^.*/([^/]*)\\.(owl|rdf)[#/]","$1:")+
- ANSI_RESET+"."+ANSI_RED+
- next.get("?concept").toString().replaceFirst("^.*/([^/]*)\\.(owl|rdf)[#/]","$1:")+
- ANSI_RESET+" ";
- }
-
- annos.add(anno);
-
- String head = "";
- try {
- head =
- QueryExecutionFactory.create(
- "PREFIX conll: \n"+
- "SELECT ?head WHERE { <"+word+"> conll:HEAD ?head} LIMIT 1",
- m).execSelect().next().get("?head").toString();
- if(Integer.parseInt(head.replaceAll("[^0-9]","")) < Integer.parseInt(word.replaceAll("[^0-9]","")))
- headDir.add(" \\ ");
- else
- headDir.add(" / ");
- } catch (NumberFormatException e) {
- e.printStackTrace();
- if(head.compareTo(word)<1) headDir.add(" \\ "); else headDir.add(" / ");
- } catch (NoSuchElementException e) {
- headDir.add(" ");
- }
-
- try {
- depth.add(
- Integer.parseInt(QueryExecutionFactory.create(
- "PREFIX conll: \n"+
- "SELECT (COUNT(DISTINCT ?head) AS ?depth) WHERE { <"+word+"> conll:HEAD+ ?head }",
- m).execSelect().next().get("?depth").toString().replaceFirst("^\"?([0-9]+)[\\^\"].*","$1")));
- } catch(NoSuchElementException e) {
- if(depth.size()==0) depth.add(1);
- else depth.add(depth.get(depth.size()-1));
- }
- maxDepth=Math.max(maxDepth, depth.get(depth.size()-1));
-
-
- try { // return the longest edge
- edges.add(
- QueryExecutionFactory.create(
- "PREFIX conll: \n"+
- "PREFIX fn: \n"+
- "SELECT ?edge ?length WHERE { <"+word+"> conll:EDGE ?edge. BIND(fn:string-length(?edge) AS ?length) } ORDER BY DESC(?length) LIMIT 1",
- m).execSelect().next().get("?edge").toString());
- } catch(NoSuchElementException e) {
- edges.add("");
- }
- maxEdgeLength=Math.max(maxEdgeLength,edges.get(edges.size()-1).length());
-
- String term = "";
- if(includeTermConcepts) {
- ResultSet terms_raw = QueryExecutionFactory.create(
- "PREFIX conll: \n"+
- "SELECT ?term WHERE { <"+word+"> a ?term \n"
- + "FILTER(contains(str(?term),'http://purl.org/acoli/open-ie/'))\n"
- + " } ORDER BY ASC(?term)",
- m).execSelect();
- while(terms_raw.hasNext())
- term=term+terms_raw.next().get("?term").toString().
- replaceFirst("http://purl.org/acoli/open-ie/","")+" ";
- //replaceFirst("http://purl.org/acoli/open-ie/","terms:")+" ";
- }
- terms.add(term.trim());
- maxTermLength=Math.max(maxTermLength, term.trim().length());
-
- word = QueryExecutionFactory.create(
- "PREFIX nif: \n"+
- "SELECT ?next WHERE { <"+word+"> nif:nextWord ?next } LIMIT 1",
- m).execSelect().next().get("?next").toString();
- }
- } catch (NoSuchElementException e) {
- } catch(Exception e) {
- e.printStackTrace();
- }
-
- String result = "";
-
-
- for(int i = 0; i0;j--)
- result=result+" .";
- result=result+ANSI_RESET;
- result=result+headDir.get(i);
- result=result+edges.get(i);
- for(int j = maxDepth-depth.get(i);j>0;j--)
- if(depth.get(i)>1) result=result+"--"; else result=result+" ";
- for(int j = edges.get(i).length();j1) result=result+"-"; else result=result+" ";
- result=result+" "+words.get(i);
- for(int j = words.get(i).length(); j\n"
- + "PREFIX conll: \n"
- + "SELECT ?w ?word (COUNT(DISTINCT ?pre) AS ?pos)\n"
- + "WHERE {\n"
- + "?w conll:WORD ?word.\n"
- + "?pre nif:nextWord* ?w.\n"
- + "} GROUP BY ?w ?word ORDER BY ASC(?pos)",m).execSelect();
- while(sentence.hasNext())
- result=result+sentence.next().get("?word")+" ";
-
- // write result set
- ResultSet semgraph = QueryExecutionFactory.create(
- "PREFIX rdfs: \n"
- +"PREFIX xsd: \n"
- +"SELECT DISTINCT ?s ?sl ?r ?o ?ol ?in ?out\n"
- +"WHERE { "
- + "?s ?r [].\n"
- + "OPTIONAL { ?s ?r ?o }. \n" // ?o can be blank
- + "FILTER(contains(concat(str(?r),str(?o)),'http://purl.org/acoli/open-ie/') &&\n"
- + " !contains(str(?r),'http://ufal.mff.cuni.cz/conll2009-st/task-description.html#'))\n"
- + "OPTIONAL {?s rdfs:label ?sl }\n"
- + "OPTIONAL {?o rdfs:label ?ol }\n"
- + "BIND(xsd:integer(REPLACE(STR(?s),'[^0-9]','')) AS ?snr)\n"
- + "BIND(xsd:integer(REPLACE(STR(?o),'[^0-9]','')) AS ?onr)\n"
- + "{ FILTER(!BOUND(?snr)) BIND(?snr AS ?nr) } UNION"
- + "{ FILTER(BOUND(?snr)) BIND(?onr AS ?nr) } \n"
- + "OPTIONAL { SELECT ?s (COUNT(DISTINCT *) AS ?in)\n"
- + " WHERE { ?sin ?rin ?s FILTER(!ISBLANK(?sin)) FILTER(contains(str(?rin),'http://purl.org/acoli/open-ie/')) } GROUP BY ?s \n"
- + "}"
- + "OPTIONAL { SELECT ?s (COUNT(DISTINCT *) AS ?out)\n"
- + " WHERE { ?s ?rout ?sout FILTER(!ISBLANK(?sout)) FILTER(contains(str(?rout),'http://purl.org/acoli/open-ie/'))} GROUP BY ?s \n"
- + "}"
- + "}"
- + "ORDER BY ASC(?nr) ASC(?snr) ASC(?onr) ?r ?s ?o",
- m).execSelect();
- while(semgraph.hasNext()) {
- QuerySolution next = semgraph.next();
- RDFNode sNode = next.get("?s");
- String nextS = sNode.toString().replaceAll(".*[#/]","");
- if(!sNode.isURIResource()) nextS="[]";
- if(next.get("?sl")!=null) nextS=nextS+" "+ANSI_CYAN+"\""+next.get("?sl")+"\""+ANSI_RESET;
- if(!nextS.equals(s)) {
- result=result+"\n"+nextS+" ("+
- ("0"+next.get("?in")).replaceFirst("[^0-9].*","").replaceFirst("^0*([^0])","$1")+" > node > "+
- ("0"+next.get("?out")).toString().replaceFirst("[^0-9].*","").replaceFirst("^0*([^0])","$1")+")";
- }
- String nextR = next.get("?r").toString()
- .replaceAll("http://ufal.mff.cuni.cz/conll2009-st/task-description.html#(.*)$",ANSI_BLUE+ANSI_ULINE+"$1"+ANSI_RESET)
- .replaceAll("http://purl.org/acoli/open-ie/(.*)",ANSI_YLW_BK+"terms:$1"+ANSI_RESET)
- .replaceAll("http://www.w3.org/1999/02/22-rdf-syntax-ns#type","a");
-
- String nextO = next.get("?o").toString()
- .replaceAll("http://purl.org/acoli/open-ie/(.*)",ANSI_YLW_BK+"terms:$1"+ANSI_RESET)
- .replaceAll("[^ \t]*[#/]","");
- if(next.get("?ol")!=null)
- nextO=nextO+" "+ANSI_CYAN+"\""+next.get("?ol")+"\""+ANSI_RESET;
-
- if(!nextR.equals("a") || includeTermConcepts==true) {
- if(!nextS.equals(s) || !nextR.equals(r))
- result=result+"\n\t"+nextR;
- else if(!nextO.equals(o)) result=result+"; ";
- if(!nextS.equals(s) || !nextR.equals(r) || !nextO.equals(o)) {
- result=result+" "+nextO;
- }
- }
- s=nextS;
- r=nextR;
- o=nextO;
- }
- } catch (NoSuchElementException e) {
- } catch (Exception e) {
- e.printStackTrace();
- }
- return result+"\n";
- }
-
- /** require that every line starts with a subject, sort: @ (prefix) & # (comment) > lines, lines sorted lexiconumerically, i.e., normalize length of integers (regardless of position) before sorting */
- protected static String reorderTTLBuffer(String buffer, List cols) {
- String result ="";
- try {
- BufferedReader in = new BufferedReader(new StringReader(buffer));
- Hashtable key2line = new Hashtable();
- String line;
- while((line=in.readLine())!=null) {
- line=line.trim();
- if(line.startsWith("@")) result=result+line+"\n"; else
- if(line.startsWith("#")) result=result+line+"\n"; else
- if(!line.equals("")) {
- //reorder columns according to user list.
- String orderedLine = "";
- List statements = new ArrayList(Arrays.asList(line.substring(0, line.lastIndexOf(".")-1).split(";\\s*\t"))); //TODO: only consider ; not ";"
- List columns = new ArrayList();
- // Subject is always first. Change if complications occur.
- if (statements.get(0).contains("nif:Word")) {
- //do rdf:type reorder
- List concepts = new ArrayList(Arrays.asList(statements.get(0).split(",")));
- String[] subject = concepts.get(0).split("\\sa\\s");
- if (subject.length == 2) {
- orderedLine += subject[0] + " a nif:Word";
- if (!subject[1].contains("nif:Word")) {
- concepts.set(0, subject[1]);
- } else {
- concepts.remove(0);
- }
- } else {
- orderedLine += concepts.get(0);
- concepts.remove(0);
- }
- for (String concept:concepts) {
- if (concept.contains("nif:Word")) continue;
- orderedLine += ", " + concept.trim();
- }
- } else {
- orderedLine = statements.get(0).trim();
- }
- statements.remove(0);
- //do column reorder
- columns.add("nif:Word");
- columns.add("conll:WORD");
- columns.addAll(cols);
- for (String col:columns) {
- for (int i = 0; i < statements.size();i++) {
- if (statements.get(i).contains(col)) {
- orderedLine += "; " + statements.get(i).trim();
- statements.remove(i);
- break;
- }
- }
- }
- //add rest of columns to the end
- String nifnext = "";
- for (int i = 0; i < statements.size();i++) {
- if (statements.get(i).contains("nif:nextWord"))
- nifnext = "; " + statements.get(i).trim();
- else
- orderedLine += "; " + statements.get(i).trim();
- }
- if (!orderedLine.equals("")) {
- orderedLine += nifnext + " .";
- line = orderedLine;
- }
-
-
- //reorder lines
- String tmp=line.replaceAll("\t"," ").replaceAll("([^0-9])([0-9])","$1\t$2").replaceAll("([0-9])([^0-9])","$1\t$2"); // key \t-split
- String key="";
- for(String s : tmp.split("\t")) {
- if(s.matches("^[0-9]+$"))
- while(s.length()<64) s="0"+s;
- key=key+s;
- }
- key2line.put(key,line);
- }
- }
- List keys = new ArrayList(key2line.keySet());
- Collections.sort(keys);
- for(String key: keys)
- result=result+key2line.get(key)+"\n";
- } catch (IOException e) {
- e.printStackTrace();
- }
- return result;
- }
-
- /** note: the last column must contain literal values, not HEAD */
- public static String columnsAsSelect(List cols) {
- String select = ""
- + "PREFIX nif: \n"
- + "PREFIX rdfs: \n"
- + "PREFIX conll: \n"
- + "PREFIX xsd: \n"
-
- + "SELECT ";
- for (String col:cols) {
- select += "?"+col+" ";
- }
-
- select += "{\n";
- select += " SELECT \n";
- select += " ?sid ?wid \n";
-
- for (String col:cols) {
- select += " (group_concat(?"+col+"s;separator='|') as ?"+col+")\n";
- }
-
- String lastCol = cols.get(cols.size()-1);
-
- select += " WHERE {\n";
- select += " ?word a nif:Word .\n";
- select += " {\n";
- select += " SELECT ?word (count(distinct ?preS) as ?sid) (count(distinct ?pre) as ?wid)\n";
- select += " WHERE {\n";
- select += " ?word a nif:Word .\n";
- select += " ?pre nif:nextWord* ?word .\n";
- select += " ?word conll:HEAD+ ?s. ?s a nif:Sentence. ?preS nif:nextSentence* ?s.\n";
- select += " }\n";
- select += " group by ?word\n";
- select += " }\n";
- for (String col:cols) {
- if(col.equals(lastCol)) { // cast to string
- if (col.equals("HEAD")) { //TODO: streamline! only difference to statement below is binding to HEADa instead of HEADs
- select += " OPTIONAL {\n";
- select += " ?word conll:HEAD ?headurl .\n";
- select += " bind(replace(str(?headurl), '^.*s[0-9]+_([0-9]+)$', '$1') as ?HEADa) .\n";
- select += " } .\n";
- } else {
- select += " OPTIONAL{?word conll:"+col+" ?"+col+"_raw .";
- select += " BIND(str(?"+col+"_raw) as ?"+col+"a)} .\n";
- }
- select += " BIND(concat(if(bound(?"+col+"a),?"+col+"a,'_'),\n";
- select += " IF(EXISTS { ?word nif:nextWord [] }, '', '\\n')) as ?"+col+"s)\n";
- // we append a linebreak to the value of the last column to generate sentence breaks within a local graph
- } else if (col.equals("HEAD")) {
- select += " OPTIONAL {\n";
- select += " ?word conll:HEAD ?headurl .\n";
- select += " bind(replace(str(?headurl), '^.*s[0-9]+_([0-9]+)$', '$1') as ?HEADs) .\n";
- select += " } .\n";
- } else {
- select += " OPTIONAL{?word conll:"+col+" ?"+col+"_raw .";
- select += " BIND(str(?"+col+"_raw) as ?"+col+"s)} .\n"; // cast to string
- }
- }
- select += " }\n";
- select += " group by ?word ?sid ?wid\n";
- select += " order by ?sid ?wid\n";
- select += "}\n";
-
- return select;
- }
-
- /**
- * FOR LEO: please move whereever you like
- * @param m
- * CoNLL-RDF sentence as Model
- * @return
- * String[0]: all comments + \n
- * String[1]: model as Turtle (unsorted)
- * concatenate: Full CoNLL-RDF output
- */
- public static String[] conllRdfModel2String(Model m) {
- String[] out = new String[2];
-
- //generate comments in out[0]
- out[0] = new String();
- String selectComments = "PREFIX nif: \n"
- + "PREFIX rdfs: \n"
- + "SELECT ?c WHERE {?x a nif:Sentence . ?x rdfs:comment ?c}";
- QueryExecution qexec = QueryExecutionFactory.create(selectComments, m);
- ResultSet results = qexec.execSelect();
- while (results.hasNext()) {
- //please check the regex. Should put a # in front of every line, which does not already start with #.
- out[0] += results.next().getLiteral("c").toString().replaceAll("^([^#])", "#\1")+"\n";
- }
-
- //generate CoNLL-RDF Turtle (unsorted) in out[1]
- StringWriter modelOut = new StringWriter();
- m.write(modelOut, "TTL");
- out[1] = modelOut.toString();
- return out;
- }
-
- /** run either SELECT statement (cf. https://jena.apache.org/documentation/query/app_api.html) and return CoNLL-like TSV or just TTL
- * Note: this CoNLL-like export has limitations, of course: it will export one property per column, hence, collapsed dependencies or
- * SRL annotations cannot be reconverted */
- public static void printSparql(String buffer, String select, Writer out) throws IOException {
- Model m = ModelFactory.createDefaultModel().read(new StringReader(buffer),null, "TTL");
- String selectComments = "PREFIX nif: \n"
- + "PREFIX rdfs: \n"
- + "SELECT ?c WHERE {?x a nif:Sentence . ?x rdfs:comment ?c}";
- QueryExecution qexec = QueryExecutionFactory.create(selectComments, m);
- ResultSet results = qexec.execSelect();
- Set comments = new HashSet<>();
- boolean hasGlobalComments = false;
- while (results.hasNext()) {
- for (String result : results.next().getLiteral("c").toString().split("\\\\n")) {
- if (result.trim().matches("^\\s?global\\.columns\\s?=.*") )
- hasGlobalComments = true;
- else
- comments.add(result);
- }
- }
- qexec = QueryExecutionFactory.create(select, m);
- results = qexec.execSelect();
- List cols = results.getResultVars();
- BufferedReader in = new BufferedReader(new StringReader(buffer));
- Hashtable key2line = new Hashtable();
- String line;
- while((line=in.readLine())!=null) {
- if (line.trim().startsWith("#")) {
- for (String splitComment : line.split("\t")) {
- if (splitComment.trim().matches("^#\\s?global\\.columns\\s?=.*"))
- hasGlobalComments = true;
- else
- comments.add(splitComment.replace("#",""));
- }
- }
-
- }
- if (hasGlobalComments)
- out.write("# global.columns = " + String.join(" ", cols) + "\n");
- else {
- out.write("# global.columns = "+String.join(" ", cols)+"\n");
- }
- for (String comment : comments) {
- out.write("#"+comment+"\n");
- }
-
- while(results.hasNext()) {
- QuerySolution sol = results.next();
- for(String col : cols)
- if(sol.get(col)==null) out.write("_\t"); // CoNLL practice
- else out.write(sol.get(col)+"\t");
- out.write("\n");
- out.flush();
- }
- out.write("\n");
- out.flush();
- }
-
-
- /**
- * Searches a string buffer that is expected to represent a sentence for any
- * rdfs:comment properties and checks them for a CoNLL-U Plus like global.columns comments.
- * Defaults to an empty columnNames Array if not present.
- * @param buffer a string buffer representing a sentence in conll-rdf
- * @return ArrayList of column names, empty if not present.
- */
- private List findColumnNamesInRDFBuffer(String buffer) {
- List columnNames = new ArrayList<>();
- Model m = ModelFactory.createDefaultModel().read(new StringReader(buffer),null, "TTL");
- String selectComments = "PREFIX nif: \n"
- + "PREFIX rdfs: \n"
- + "SELECT ?c WHERE {?x a nif:Sentence . ?x rdfs:comment ?c}";
- QueryExecution qexec = QueryExecutionFactory.create(selectComments, m);
- ResultSet results = qexec.execSelect();
- while (results.hasNext()) {
- String[] comments = results.next().getLiteral("c").toString().split("\\\\n");
- for (String comment : comments) {
- if (comment.matches("^\\s?global\\.columns\\s?=.*")) {
- columnNames.addAll(Arrays.asList(comment.trim()
- .replaceFirst("\\s?global\\.columns\\s?=", "")
- .trim().split(" |\t")));
- LOG.info("Found global columns comment in rdfs:comment");
- return columnNames;
- }
- }
- }
- return columnNames;
- }
-
- @Override
- protected void processSentenceStream() throws IOException {
- String line;
- String lastLine ="";
- String buffer="";
- BufferedReader in = new BufferedReader(new InputStreamReader(getInputStream()));
- while((line = in.readLine())!=null) {
- line=line.replaceAll("[\t ]+"," ").trim();
-
- if(!buffer.trim().equals(""))
- if((line.startsWith("@") || line.startsWith("#")) && !lastLine.startsWith("@") && !lastLine.startsWith("#")) { //!buffer.matches("@[^\n]*\n?$")) {
- for (Module m:modules) {
- if(m.getMode()==Mode.CONLLRDF) m.getOutputStream().println(reorderTTLBuffer(buffer, m.getCols()));
- if(m.getMode()==Mode.DEBUG) System.err.println(colorTTL(reorderTTLBuffer(buffer, m.getCols())));
- if(m.getMode()==Mode.CONLL) {
- if (m.getCols().size() < 1) {// no column args supplied
- LOG.info("No column names in cmd args, searching rdf comments..");
- List conllColumns = findColumnNamesInRDFBuffer(buffer);
- if (conllColumns.size()>0) {
- LOG.info("Using #global.comments from rdf");
- m.setCols(conllColumns);
- } else {
- LOG.info("Trying conll columns now..");
- conllColumns = CoNLLStreamExtractor.findFieldsFromComments(new BufferedReader(new StringReader(buffer.trim())), 1);
- if (conllColumns.size()>0) {
- m.setCols(conllColumns);
- }
- }
- }
- if (m.getCols().size() < 1) {
- LOG.info("Supply column names some way! (-conll arg, global.columns or rdf comments");
- }
- else
- printSparql(buffer, columnsAsSelect(m.getCols()), new OutputStreamWriter(m.getOutputStream()));
- }
- if(m.getMode()==Mode.QUERY) printSparql(buffer, m.getSelect(), new OutputStreamWriter(m.getOutputStream()));
- if(m.getMode()==Mode.GRAMMAR) m.getOutputStream().println(extractCoNLLGraph(buffer,true));
- if(m.getMode()==Mode.SEMANTICS) m.getOutputStream().println(extractTermGraph(buffer,true));
- if(m.getMode()==Mode.GRAMMAR_SEMANTICS) {
- m.getOutputStream().println(extractCoNLLGraph(buffer,true));
- m.getOutputStream().println(extractTermGraph(buffer,false));
- }
- }
- buffer="";
- }
- //System.err.println(ANSI_RED+"> "+line+ANSI_RESET);
- if(line.trim().startsWith("@") && !lastLine.trim().endsWith("."))
- //System.out.print("\n");
- buffer=buffer+"\n";
-
- if(line.trim().startsWith("#") && (!lastLine.trim().startsWith("#")))
- // System.out.print("\n");
- buffer=buffer+"\n";
-
- //System.out.print(" "+color(line));
- //System.out.print(color(line));
- buffer=buffer+line+"\t";//+"\n";
-
- if(line.trim().endsWith(".") || line.trim().matches("^(.*>)?[^<]*#"))
- //System.out.print("\n");
- buffer=buffer+"\n";
-
- //System.out.println();
- lastLine=line;
- }
-
- for (Module m:modules) {
- if(m.getMode()==Mode.CONLLRDF) m.getOutputStream().println(reorderTTLBuffer(buffer, m.getCols()));
- if(m.getMode()==Mode.DEBUG) System.err.println(colorTTL(reorderTTLBuffer(buffer, m.getCols())));
- if(m.getMode()==Mode.CONLL) {
- if (m.getCols().size() < 1) {
- LOG.info("No column names in cmd args, searching rdf comments..");
- List conllColumns = findColumnNamesInRDFBuffer(buffer);
- if (conllColumns.size()>0) {
- LOG.info("Using #global.comments from rdf");
- m.setCols(conllColumns);
- } else {
- LOG.info("Trying conll columns now..");
- conllColumns = CoNLLStreamExtractor.findFieldsFromComments(new BufferedReader(new StringReader(buffer.trim())), 1);
- if (conllColumns.size()>0) {
- m.setCols(conllColumns);
- }
- }
- }
- if (m.getCols().size() < 1)
- throw new IOException("-conll argument needs at least one COL to export!");
- else
- printSparql(buffer, columnsAsSelect(m.getCols()), new OutputStreamWriter(m.getOutputStream()));
- }
- if(m.getMode()==Mode.QUERY) printSparql(buffer, m.getSelect(), new OutputStreamWriter(m.getOutputStream()));
- if(m.getMode()==Mode.GRAMMAR) m.getOutputStream().println(extractCoNLLGraph(buffer,true));
- if(m.getMode()==Mode.SEMANTICS) m.getOutputStream().println(extractTermGraph(buffer,true));
- if(m.getMode()==Mode.GRAMMAR_SEMANTICS) {
- m.getOutputStream().println(extractCoNLLGraph(buffer,true));
- m.getOutputStream().println(extractTermGraph(buffer,false));
- }
- }
- }
-
- public static void main(String[] args) throws IOException {
- final CoNLLRDFFormatter formatter;
- try {
- formatter = new CoNLLRDFFormatterFactory().buildFromCLI(args);
- formatter.setInputStream(System.in);
- formatter.setOutputStream(System.out);
- } catch (ParseException e) {
- LOG.error(e);
- System.exit(1);
- return;
- }
- formatter.processSentenceStream();
- }
-}
+/*
+ * Copyright [2017] [ACoLi Lab, Prof. Dr. Chiarcos, Goethe University Frankfurt]
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.acoli.conll.rdf;
+
+import java.io.*;
+import java.util.*;
+
+import org.apache.jena.rdf.model.*; // Jena 2.x
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.acoli.fintan.core.FintanStreamHandler;
+import org.acoli.fintan.core.StreamWriter;
+import org.acoli.fintan.load.RDFStreamLoader;
+import org.apache.commons.cli.ParseException;
+import org.apache.jena.query.*;
+
+
+/** reads CoNLL-RDF from stdin, writes it formatted to stdout (requires a Un*x shell)
+ * this is basically for diagnostic purposes
+ * @author Christian Chiarcos {@literal chiarcos@informatik.uni-frankfurt.de}
+ * @author Christian Faeth {@literal faeth@em.uni-frankfurt.de}
+ */
+public class CoNLLRDFFormatter extends StreamWriter {
+
+ protected static Logger LOG = LogManager.getLogger(CoNLLRDFFormatter.class.getName());
+ public class Module {
+ private Mode mode = Mode.CONLLRDF;
+ private List cols = new ArrayList();
+ String select = "";
+ private PrintStream outputStream;
+
+ public Mode getMode() {
+ return mode;
+ }
+
+ public void setMode(Mode mode) {
+ this.mode = mode;
+ }
+
+ public List getCols() {
+ return cols;
+ }
+
+ public void setCols(List cols) {
+ this.cols = cols;
+ }
+
+ public String getSelect() {
+ return select;
+ }
+
+ public void setSelect(String select) {
+ this.select = select;
+ }
+
+ public PrintStream getOutputStream() {
+ if (outputStream != null) {
+ return outputStream;
+ } else {
+ // Retrieve outputStream of the enclosing Formatter
+ return new PrintStream(CoNLLRDFFormatter.this.getOutputStream());
+ }
+ }
+
+ public void setOutputStream(PrintStream outputStream) {
+ this.outputStream = outputStream;
+ }
+ }
+
+ public static enum Mode {
+ CONLL, CONLLRDF, DEBUG, QUERY, GRAMMAR, SEMANTICS, GRAMMAR_SEMANTICS
+ }
+
+ private List modules = new ArrayList();
+
+ public List getModules() {
+ return modules;
+ }
+ public Module addModule(Mode mode) {
+ Module module = new Module();
+ module.setMode(mode);
+ modules.add(module);
+ return module;
+ }
+
+ /** do some highlighting, but provide the full TTL data*/
+ public String colorTTL(String buffer) {
+ return buffer.replaceAll("(terms:[^ ]*)",ANSI.YLW_BK+"$1"+ANSI.RESET)
+ .replaceAll("(rdfs:label +)(\"[^\"]*\")","$1"+ANSI.CYAN+"$2"+ANSI.RESET)
+ .replaceAll("(nif:[^ ]*)",ANSI.YELLOW+"$1"+ANSI.RESET)
+ .replaceAll("(conll:[^ \n]*)([^;\n]*[;]?)",ANSI.CYAN_BK+ANSI.BRIGHTER+ANSI.BLUE+"$1"+ANSI.RESET+ANSI.CYAN_BK+ANSI.BRIGHTER+"$2"+ANSI.RESET);
+ }
+
+ /** default: do not return type assignments */
+ protected static String extractCoNLLGraph(String buffer) {
+ return extractCoNLLGraph(buffer,false);
+ }
+
+ /** buffer must be valid turtle, produces an extra column for terms: type assignments */
+ protected static String extractCoNLLGraph(String buffer, boolean includeTermConcepts) {
+ Model m = null;
+ try {
+ m = ModelFactory.createDefaultModel().read(new StringReader(buffer),null, "TTL");
+ } catch (org.apache.jena.riot.RiotException e) {
+ e.printStackTrace();
+ LOG.error("while reading:\n"+buffer);
+ }
+ Vector ids = new Vector();
+ Vector words = new Vector();
+ Vector annos = new Vector();
+ Vector depth = new Vector();
+ Vector edges = new Vector();
+ Vector headDir = new Vector();
+ Vector terms = new Vector();
+ Integer maxDepth = 0;
+ Integer maxEdgeLength = 0;
+ Integer maxIdLength = 0;
+ Integer maxWordLength = 0;
+ Integer maxTermLength = 0;
+
+ String word = null;
+ try {
+ word = QueryExecutionFactory.create(
+ "PREFIX nif: \n"+
+ "SELECT ?first WHERE { ?first a nif:Word. FILTER(NOT EXISTS{ [] nif:nextWord ?first })} LIMIT 1",
+ m).execSelect().next().get("?first").toString();
+ while(true) {
+ ids.add(word.replaceAll(".*[\\\\/#:]", ""));
+ maxIdLength=Math.max(maxIdLength, ids.get(ids.size()-1).length());
+ try {
+ words.add(
+ QueryExecutionFactory.create(
+ "PREFIX conll: \n"+
+ "SELECT ?word WHERE { <"+word+"> conll:WORD ?word } LIMIT 1",
+ m).execSelect().next().get("?word").toString());
+ } catch (NoSuchElementException e) {
+ LOG.warn("Warning: no conll:WORD (WORD column) found");
+ words.add("");
+ }
+ maxWordLength=Math.max(maxWordLength, words.get(words.size()-1).length());
+ String anno = "";
+ ResultSet annos_raw = QueryExecutionFactory.create(
+ "PREFIX conll: \n"+
+ "SELECT ?rel ?val WHERE { <"+word+"> ?rel ?val \n"
+ + "FILTER(contains(str(?rel),'http://ufal.mff.cuni.cz/conll2009-st/task-description.html#'))\n"
+ + "FILTER(?rel!=conll:HEAD && ?rel!=conll:EDGE && ?rel!=conll:WORD) } ORDER BY ASC(?rel)",
+ m).execSelect();
+ String rel = "";
+ while(annos_raw.hasNext()) {
+ QuerySolution next = annos_raw.next();
+ String nextRel = next.get("?rel").toString().replaceFirst(".*#","");
+ if(!rel.equals(nextRel))
+ anno=anno+
+ ANSI.BLUE+ANSI.ULINE+
+ nextRel+
+ ANSI.RESET+" ";
+ rel=nextRel;
+ anno=anno+
+ next.get("?val").toString().
+ replaceFirst("^http://purl.org/acoli/open-ie/(.*)$",ANSI.YLW_BK+"$1"+ANSI.RESET).
+ replaceFirst(".*#","")+
+ " ";
+ }
+
+ // we append OLiA annotations to CoNLL annotations
+ ResultSet olia_types= QueryExecutionFactory.create(
+ "PREFIX conll: \n"+
+ "SELECT ?concept WHERE { <"+word+"> a ?concept \n"
+ + "FILTER(contains(str(?concept),'http://purl.org/olia'))\n"
+ + "} ORDER BY ASC(?val)",
+ m).execSelect();
+ while(olia_types.hasNext())
+ anno=anno+
+ ANSI.RED+
+ olia_types.next().get("?concept").toString().replaceFirst("^.*/([^/]*)\\.(owl|rdf)[#/]","$1:")+
+ ANSI.RESET+" ";
+
+ // append OLiA features
+ ResultSet olia_feats= QueryExecutionFactory.create(
+ "PREFIX conll: \n"+
+ "SELECT ?rel ?concept WHERE { <"+word+"> ?rel ?val. ?val a ?concept.\n"
+ + "FILTER(contains(str(?rel),'http://purl.org/olia'))\n"
+ + "FILTER(contains(str(?concept),'http://purl.org/olia'))\n"
+ + "} ORDER BY ASC(?rel)",
+ m).execSelect();
+ while(olia_feats.hasNext()) {
+ QuerySolution next = olia_feats.next();
+ anno = anno+
+ ANSI.RED+ANSI.ULINE+
+ next.get("?rel").toString().replaceFirst("^.*/([^/]*)\\.(owl|rdf)[#/]","$1:")+
+ ANSI.RESET+"."+ANSI.RED+
+ next.get("?concept").toString().replaceFirst("^.*/([^/]*)\\.(owl|rdf)[#/]","$1:")+
+ ANSI.RESET+" ";
+ }
+
+ annos.add(anno);
+
+ String head = "";
+ try {
+ head =
+ QueryExecutionFactory.create(
+ "PREFIX conll: \n"+
+ "SELECT ?head WHERE { <"+word+"> conll:HEAD ?head} LIMIT 1",
+ m).execSelect().next().get("?head").toString();
+ if(Integer.parseInt(head.replaceAll("[^0-9]","")) < Integer.parseInt(word.replaceAll("[^0-9]","")))
+ headDir.add(" \\ ");
+ else
+ headDir.add(" / ");
+ } catch (NumberFormatException e) {
+ e.printStackTrace();
+ if(head.compareTo(word)<1) headDir.add(" \\ "); else headDir.add(" / ");
+ } catch (NoSuchElementException e) {
+ headDir.add(" ");
+ }
+
+ try {
+ depth.add(
+ Integer.parseInt(QueryExecutionFactory.create(
+ "PREFIX conll: \n"+
+ "SELECT (COUNT(DISTINCT ?head) AS ?depth) WHERE { <"+word+"> conll:HEAD+ ?head }",
+ m).execSelect().next().get("?depth").toString().replaceFirst("^\"?([0-9]+)[\\^\"].*","$1")));
+ } catch(NoSuchElementException e) {
+ if(depth.size()==0) depth.add(1);
+ else depth.add(depth.get(depth.size()-1));
+ }
+ maxDepth=Math.max(maxDepth, depth.get(depth.size()-1));
+
+
+ try { // return the longest edge
+ edges.add(
+ QueryExecutionFactory.create(
+ "PREFIX conll: \n"+
+ "PREFIX fn: \n"+
+ "SELECT ?edge ?length WHERE { <"+word+"> conll:EDGE ?edge. BIND(fn:string-length(?edge) AS ?length) } ORDER BY DESC(?length) LIMIT 1",
+ m).execSelect().next().get("?edge").toString());
+ } catch(NoSuchElementException e) {
+ edges.add("");
+ }
+ maxEdgeLength=Math.max(maxEdgeLength,edges.get(edges.size()-1).length());
+
+ String term = "";
+ if(includeTermConcepts) {
+ ResultSet terms_raw = QueryExecutionFactory.create(
+ "PREFIX conll: \n"+
+ "SELECT ?term WHERE { <"+word+"> a ?term \n"
+ + "FILTER(contains(str(?term),'http://purl.org/acoli/open-ie/'))\n"
+ + " } ORDER BY ASC(?term)",
+ m).execSelect();
+ while(terms_raw.hasNext())
+ term=term+terms_raw.next().get("?term").toString().
+ replaceFirst("http://purl.org/acoli/open-ie/","")+" ";
+ //replaceFirst("http://purl.org/acoli/open-ie/","terms:")+" ";
+ }
+ terms.add(term.trim());
+ maxTermLength=Math.max(maxTermLength, term.trim().length());
+
+ word = QueryExecutionFactory.create(
+ "PREFIX nif: \n"+
+ "SELECT ?next WHERE { <"+word+"> nif:nextWord ?next } LIMIT 1",
+ m).execSelect().next().get("?next").toString();
+ }
+ } catch (NoSuchElementException e) {
+ } catch(Exception e) {
+ e.printStackTrace();
+ }
+
+ String result = "";
+
+
+ for(int i = 0; i0;j--)
+ result=result+" .";
+ result=result+ANSI.RESET;
+ result=result+headDir.get(i);
+ result=result+edges.get(i);
+ for(int j = maxDepth-depth.get(i);j>0;j--)
+ if(depth.get(i)>1) result=result+"--"; else result=result+" ";
+ for(int j = edges.get(i).length();j1) result=result+"-"; else result=result+" ";
+ result=result+" "+words.get(i);
+ for(int j = words.get(i).length(); j\n"
+ + "PREFIX conll: \n"
+ + "SELECT ?w ?word (COUNT(DISTINCT ?pre) AS ?pos)\n"
+ + "WHERE {\n"
+ + "?w conll:WORD ?word.\n"
+ + "?pre nif:nextWord* ?w.\n"
+ + "} GROUP BY ?w ?word ORDER BY ASC(?pos)",m).execSelect();
+ while(sentence.hasNext())
+ result=result+sentence.next().get("?word")+" ";
+
+ // write result set
+ ResultSet semgraph = QueryExecutionFactory.create(
+ "PREFIX rdfs: \n"
+ +"PREFIX xsd: \n"
+ +"SELECT DISTINCT ?s ?sl ?r ?o ?ol ?in ?out\n"
+ +"WHERE { "
+ + "?s ?r [].\n"
+ + "OPTIONAL { ?s ?r ?o }. \n" // ?o can be blank
+ + "FILTER(contains(concat(str(?r),str(?o)),'http://purl.org/acoli/open-ie/') &&\n"
+ + " !contains(str(?r),'http://ufal.mff.cuni.cz/conll2009-st/task-description.html#'))\n"
+ + "OPTIONAL {?s rdfs:label ?sl }\n"
+ + "OPTIONAL {?o rdfs:label ?ol }\n"
+ + "BIND(xsd:integer(REPLACE(STR(?s),'[^0-9]','')) AS ?snr)\n"
+ + "BIND(xsd:integer(REPLACE(STR(?o),'[^0-9]','')) AS ?onr)\n"
+ + "{ FILTER(!BOUND(?snr)) BIND(?snr AS ?nr) } UNION"
+ + "{ FILTER(BOUND(?snr)) BIND(?onr AS ?nr) } \n"
+ + "OPTIONAL { SELECT ?s (COUNT(DISTINCT *) AS ?in)\n"
+ + " WHERE { ?sin ?rin ?s FILTER(!ISBLANK(?sin)) FILTER(contains(str(?rin),'http://purl.org/acoli/open-ie/')) } GROUP BY ?s \n"
+ + "}"
+ + "OPTIONAL { SELECT ?s (COUNT(DISTINCT *) AS ?out)\n"
+ + " WHERE { ?s ?rout ?sout FILTER(!ISBLANK(?sout)) FILTER(contains(str(?rout),'http://purl.org/acoli/open-ie/'))} GROUP BY ?s \n"
+ + "}"
+ + "}"
+ + "ORDER BY ASC(?nr) ASC(?snr) ASC(?onr) ?r ?s ?o",
+ m).execSelect();
+ while(semgraph.hasNext()) {
+ QuerySolution next = semgraph.next();
+ RDFNode sNode = next.get("?s");
+ String nextS = sNode.toString().replaceAll(".*[#/]","");
+ if(!sNode.isURIResource()) nextS="[]";
+ if(next.get("?sl")!=null) nextS=nextS+" "+ANSI.CYAN+"\""+next.get("?sl")+"\""+ANSI.RESET;
+ if(!nextS.equals(s)) {
+ result=result+"\n"+nextS+" ("+
+ ("0"+next.get("?in")).replaceFirst("[^0-9].*","").replaceFirst("^0*([^0])","$1")+" > node > "+
+ ("0"+next.get("?out")).toString().replaceFirst("[^0-9].*","").replaceFirst("^0*([^0])","$1")+")";
+ }
+ String nextR = next.get("?r").toString()
+ .replaceAll("http://ufal.mff.cuni.cz/conll2009-st/task-description.html#(.*)$",ANSI.BLUE+ANSI.ULINE+"$1"+ANSI.RESET)
+ .replaceAll("http://purl.org/acoli/open-ie/(.*)",ANSI.YLW_BK+"terms:$1"+ANSI.RESET)
+ .replaceAll("http://www.w3.org/1999/02/22-rdf-syntax-ns#type","a");
+
+ String nextO = next.get("?o").toString()
+ .replaceAll("http://purl.org/acoli/open-ie/(.*)",ANSI.YLW_BK+"terms:$1"+ANSI.RESET)
+ .replaceAll("[^ \t]*[#/]","");
+ if(next.get("?ol")!=null)
+ nextO=nextO+" "+ANSI.CYAN+"\""+next.get("?ol")+"\""+ANSI.RESET;
+
+ if(!nextR.equals("a") || includeTermConcepts==true) {
+ if(!nextS.equals(s) || !nextR.equals(r))
+ result=result+"\n\t"+nextR;
+ else if(!nextO.equals(o)) result=result+"; ";
+ if(!nextS.equals(s) || !nextR.equals(r) || !nextO.equals(o)) {
+ result=result+" "+nextO;
+ }
+ }
+ s=nextS;
+ r=nextR;
+ o=nextO;
+ }
+ } catch (NoSuchElementException e) {
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ return result+"\n";
+ }
+
+ /** require that every line starts with a subject, sort: @ (prefix) & # (comment) > lines, lines sorted lexiconumerically, i.e., normalize length of integers (regardless of position) before sorting */
+ protected static String reorderTTLBuffer(String buffer, List cols) {
+ String result ="";
+ try {
+ BufferedReader in = new BufferedReader(new StringReader(buffer));
+ Hashtable key2line = new Hashtable();
+ String line;
+ while((line=in.readLine())!=null) {
+ line=line.trim();
+ if(line.startsWith("@")) result=result+line+"\n"; else
+ if(line.startsWith("#")) result=result+line+"\n"; else
+ if(!line.equals("")) {
+ //reorder columns according to user list.
+ String orderedLine = "";
+ List statements = new ArrayList(Arrays.asList(line.substring(0, line.lastIndexOf(".")-1).split(";\\s*\t"))); //TODO: only consider ; not ";"
+ List columns = new ArrayList();
+ // Subject is always first. Change if complications occur.
+ if (statements.get(0).contains("nif:Word")) {
+ //do rdf:type reorder
+ List concepts = new ArrayList(Arrays.asList(statements.get(0).split(",")));
+ String[] subject = concepts.get(0).split("\\sa\\s");
+ if (subject.length == 2) {
+ orderedLine += subject[0] + " a nif:Word";
+ if (!subject[1].contains("nif:Word")) {
+ concepts.set(0, subject[1]);
+ } else {
+ concepts.remove(0);
+ }
+ } else {
+ orderedLine += concepts.get(0);
+ concepts.remove(0);
+ }
+ for (String concept:concepts) {
+ if (concept.contains("nif:Word")) continue;
+ orderedLine += ", " + concept.trim();
+ }
+ } else {
+ orderedLine = statements.get(0).trim();
+ }
+ statements.remove(0);
+ //do column reorder
+ columns.add("nif:Word");
+ columns.add("conll:WORD");
+ columns.addAll(cols);
+ for (String col:columns) {
+ for (int i = 0; i < statements.size();i++) {
+ if (statements.get(i).contains(col)) {
+ orderedLine += "; " + statements.get(i).trim();
+ statements.remove(i);
+ break;
+ }
+ }
+ }
+ //add rest of columns to the end
+ String nifnext = "";
+ for (int i = 0; i < statements.size();i++) {
+ if (statements.get(i).contains("nif:nextWord"))
+ nifnext = "; " + statements.get(i).trim();
+ else
+ orderedLine += "; " + statements.get(i).trim();
+ }
+ if (!orderedLine.equals("")) {
+ orderedLine += nifnext + " .";
+ line = orderedLine;
+ }
+
+
+ //reorder lines
+ String tmp=line.replaceAll("\t"," ").replaceAll("([^0-9])([0-9])","$1\t$2").replaceAll("([0-9])([^0-9])","$1\t$2"); // key \t-split
+ String key="";
+ for(String s : tmp.split("\t")) {
+ if(s.matches("^[0-9]+$"))
+ while(s.length()<64) s="0"+s;
+ key=key+s;
+ }
+ key2line.put(key,line);
+ }
+ }
+ List keys = new ArrayList(key2line.keySet());
+ Collections.sort(keys);
+ for(String key: keys)
+ result=result+key2line.get(key)+"\n";
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ return result;
+ }
+
+ /** note: the last column must contain literal values, not HEAD */
+ public static String columnsAsSelect(List cols) {
+ String select = ""
+ + "PREFIX nif: \n"
+ + "PREFIX rdfs: \n"
+ + "PREFIX conll: \n"
+ + "PREFIX xsd: \n"
+
+ + "SELECT ";
+ for (String col:cols) {
+ select += "?"+col+" ";
+ }
+
+ select += "{\n";
+ select += " SELECT \n";
+ select += " ?sid ?wid \n";
+
+ for (String col:cols) {
+ select += " (group_concat(?"+col+"s;separator='|') as ?"+col+")\n";
+ }
+
+ String lastCol = cols.get(cols.size()-1);
+
+ select += " WHERE {\n";
+ select += " ?word a nif:Word .\n";
+ select += " {\n";
+ select += " SELECT ?word (count(distinct ?preS) as ?sid) (count(distinct ?pre) as ?wid)\n";
+ select += " WHERE {\n";
+ select += " ?word a nif:Word .\n";
+ select += " ?pre nif:nextWord* ?word .\n";
+ select += " ?word conll:HEAD+ ?s. ?s a nif:Sentence. ?preS nif:nextSentence* ?s.\n";
+ select += " }\n";
+ select += " group by ?word\n";
+ select += " }\n";
+ for (String col:cols) {
+ if(col.equals(lastCol)) { // cast to string
+ if (col.equals("HEAD")) { //TODO: streamline! only difference to statement below is binding to HEADa instead of HEADs
+ select += " OPTIONAL {\n";
+ select += " ?word conll:HEAD ?headurl .\n";
+ select += " bind(replace(str(?headurl), '^.*s[0-9]+_([0-9]+)$', '$1') as ?HEADa) .\n";
+ select += " } .\n";
+ } else {
+ select += " OPTIONAL{?word conll:"+col+" ?"+col+"_raw .";
+ select += " BIND(str(?"+col+"_raw) as ?"+col+"a)} .\n";
+ }
+ select += " BIND(concat(if(bound(?"+col+"a),?"+col+"a,'_'),\n";
+ select += " IF(EXISTS { ?word nif:nextWord [] }, '', '\\n')) as ?"+col+"s)\n";
+ // we append a linebreak to the value of the last column to generate sentence breaks within a local graph
+ } else if (col.equals("HEAD")) {
+ select += " OPTIONAL {\n";
+ select += " ?word conll:HEAD ?headurl .\n";
+ select += " bind(replace(str(?headurl), '^.*s[0-9]+_([0-9]+)$', '$1') as ?HEADs) .\n";
+ select += " } .\n";
+ } else {
+ select += " OPTIONAL{?word conll:"+col+" ?"+col+"_raw .";
+ select += " BIND(str(?"+col+"_raw) as ?"+col+"s)} .\n"; // cast to string
+ }
+ }
+ select += " }\n";
+ select += " group by ?word ?sid ?wid\n";
+ select += " order by ?sid ?wid\n";
+ select += "}\n";
+
+ return select;
+ }
+
+ /** run either SELECT statement (cf. https://jena.apache.org/documentation/query/app_api.html) and return CoNLL-like TSV or just TTL
+ * Note: this CoNLL-like export has limitations, of course: it will export one property per column, hence, collapsed dependencies or
+ * SRL annotations cannot be reconverted */
+ public static void printSparql(String buffer, String select, Writer out) throws IOException {
+ Model m = ModelFactory.createDefaultModel().read(new StringReader(buffer),null, "TTL");
+ String selectComments = "PREFIX nif: \n"
+ + "PREFIX rdfs: \n"
+ + "SELECT ?c WHERE {?x a nif:Sentence . ?x rdfs:comment ?c}";
+ QueryExecution qexec = QueryExecutionFactory.create(selectComments, m);
+ ResultSet results = qexec.execSelect();
+ Set comments = new HashSet<>();
+ boolean hasGlobalComments = false;
+ while (results.hasNext()) {
+ for (String result : results.next().getLiteral("c").toString().split("\\\\n")) {
+ if (result.trim().matches("^\\s?global\\.columns\\s?=.*") )
+ hasGlobalComments = true;
+ else
+ comments.add(result);
+ }
+ }
+ qexec = QueryExecutionFactory.create(select, m);
+ results = qexec.execSelect();
+ List cols = results.getResultVars();
+ BufferedReader in = new BufferedReader(new StringReader(buffer));
+ Hashtable key2line = new Hashtable();
+ String line;
+ while((line=in.readLine())!=null) {
+ if (line.trim().startsWith("#")) {
+ for (String splitComment : line.split("\t")) {
+ if (splitComment.trim().matches("^#\\s?global\\.columns\\s?=.*"))
+ hasGlobalComments = true;
+ else
+ comments.add(splitComment.replace("#",""));
+ }
+ }
+
+ }
+ if (hasGlobalComments)
+ out.write("# global.columns = " + String.join(" ", cols) + "\n");
+ else {
+ out.write("# global.columns = "+String.join(" ", cols)+"\n");
+ }
+ for (String comment : comments) {
+ out.write("#"+comment+"\n");
+ }
+
+ while(results.hasNext()) {
+ QuerySolution sol = results.next();
+ for(String col : cols)
+ if(sol.get(col)==null) out.write("_\t"); // CoNLL practice
+ else out.write(sol.get(col)+"\t");
+ out.write("\n");
+ out.flush();
+ }
+ out.write("\n");
+ out.flush();
+ }
+
+
+ /**
+ * Searches a string buffer that is expected to represent a sentence for any
+ * rdfs:comment
properties and checks them for a CoNLL-U Plus like global.columns comments.
+ * Defaults to an empty columnNames Array if not present.
+ * @param buffer a string buffer representing a sentence in conll-rdf
+ * @return ArrayList of column names, empty if not present.
+ */
+ private List findColumnNamesInRDFBuffer(String buffer) {
+ List columnNames = new ArrayList<>();
+ Model m = ModelFactory.createDefaultModel().read(new StringReader(buffer),null, "TTL");
+ String selectComments = "PREFIX nif: \n"
+ + "PREFIX rdfs: \n"
+ + "SELECT ?c WHERE {?x a nif:Sentence . ?x rdfs:comment ?c}";
+ QueryExecution qexec = QueryExecutionFactory.create(selectComments, m);
+ ResultSet results = qexec.execSelect();
+ while (results.hasNext()) {
+ String[] comments = results.next().getLiteral("c").toString().split("\\\\n");
+ for (String comment : comments) {
+ if (comment.matches("^\\s?global\\.columns\\s?=.*")) {
+ columnNames.addAll(Arrays.asList(comment.trim()
+ .replaceFirst("\\s?global\\.columns\\s?=", "")
+ .trim().split(" |\t")));
+ LOG.info("Found global columns comment in rdfs:comment");
+ return columnNames;
+ }
+ }
+ }
+ return columnNames;
+ }
+
+ // FIXME @Override
+ protected void processSentenceStream() throws IOException, InterruptedException {
+ Model model;
+ String buffer;
+ while ((model = getInputStream().read()) != null) {
+ buffer = CoNLLRDFUtil.conllRdfModel2String(model);
+ processBuffer(buffer);
+ }
+ }
+
+ private void processBuffer(String buffer) throws IOException {
+ for (Module m:modules) {
+ if(m.getMode()==Mode.CONLLRDF) m.getOutputStream().println(reorderTTLBuffer(buffer, m.getCols()));
+ if(m.getMode()==Mode.DEBUG) System.err.println(colorTTL(reorderTTLBuffer(buffer, m.getCols())));
+ if(m.getMode()==Mode.CONLL) {
+ if (m.getCols().size() < 1) {// no column args supplied
+ LOG.info("No column names in cmd args, searching rdf comments..");
+ List conllColumns = findColumnNamesInRDFBuffer(buffer);
+ if (conllColumns.size() > 0) {
+ LOG.info("Using #global.comments from rdf");
+ m.setCols(conllColumns);
+ } else {
+ LOG.info("Trying conll columns now..");
+ conllColumns = CoNLLStreamExtractor.findFieldsFromComments(new BufferedReader(new StringReader(buffer.trim())), 1);
+ if (conllColumns.size() > 0) {
+ m.setCols(conllColumns);
+ }
+ }
+ }
+ if (m.getCols().size() < 1) {
+ LOG.info("Supply column names some way! (-conll arg, global.columns or rdf comments)");
+ }
+ else
+ printSparql(buffer, columnsAsSelect(m.getCols()), new OutputStreamWriter(m.getOutputStream()));
+ }
+ if(m.getMode()==Mode.QUERY) printSparql(buffer, m.getSelect(), new OutputStreamWriter(m.getOutputStream()));
+ if(m.getMode()==Mode.GRAMMAR) m.getOutputStream().println(extractCoNLLGraph(buffer,true));
+ if(m.getMode()==Mode.SEMANTICS) m.getOutputStream().println(extractTermGraph(buffer,true));
+ if(m.getMode()==Mode.GRAMMAR_SEMANTICS) {
+ m.getOutputStream().println(extractCoNLLGraph(buffer,true));
+ m.getOutputStream().println(extractTermGraph(buffer,false));
+ }
+ }
+ }
+
+ @Override
+ public void run() {
+ try {
+ processSentenceStream();
+ } catch (IOException | InterruptedException e) {
+ LOG.error(e);
+ System.exit(1);
+ }
+ }
+
+ @Override
+ public void start() {
+ run();
+ }
+
+ public static void main(String[] args) throws IOException {
+ final CoNLLRDFFormatter formatter;
+ final FintanStreamHandler stream = new FintanStreamHandler();
+ final RDFStreamLoader streamLoader = new RDFStreamLoader();
+ try {
+ formatter = new CoNLLRDFFormatterFactory().buildFromCLI(args);
+ streamLoader.setInputStream(System.in);
+ streamLoader.setOutputStream(stream);
+ formatter.setInputStream(stream);
+ formatter.setOutputStream(System.out);
+ } catch (ParseException e) {
+ LOG.error(e);
+ System.exit(1);
+ return;
+ }
+ new Thread(formatter).start();
+ new Thread(streamLoader).start();
+ }
+}
diff --git a/src/main/java/org/acoli/conll/rdf/CoNLLRDFFormatterFactory.java b/src/main/java/org/acoli/conll/rdf/CoNLLRDFFormatterFactory.java
index b79e93c..6a7abda 100644
--- a/src/main/java/org/acoli/conll/rdf/CoNLLRDFFormatterFactory.java
+++ b/src/main/java/org/acoli/conll/rdf/CoNLLRDFFormatterFactory.java
@@ -1,253 +1,254 @@
-package org.acoli.conll.rdf;
-
-import static org.acoli.conll.rdf.CoNLLRDFCommandLine.readString;
-import static org.acoli.conll.rdf.CoNLLRDFCommandLine.readUrl;
-import static org.acoli.conll.rdf.CoNLLRDFManager.parseConfAsOutputStream;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.nio.file.Paths;
-import java.util.Arrays;
-import java.util.List;
-
-import com.fasterxml.jackson.core.type.TypeReference;
-import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.fasterxml.jackson.databind.node.ObjectNode;
-
-import org.acoli.conll.rdf.CoNLLRDFFormatter.Mode;
-import org.acoli.conll.rdf.CoNLLRDFFormatter.Module;
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.ParseException;
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
-
-public class CoNLLRDFFormatterFactory extends CoNLLRDFComponentFactory {
- static Logger LOG = LogManager.getLogger(CoNLLRDFFormatterFactory.class);
-
- @Override
- public CoNLLRDFFormatter buildFromCLI(String[] args) throws IOException, ParseException {
- final CoNLLRDFFormatter formatter = new CoNLLRDFFormatter();
- final CoNLLRDFCommandLine conllCli = new CoNLLRDFCommandLine(
- "CoNLLRDFFormatter [-rdf [COLS]] [-conll COLS] [-debug] [-grammar] [-semantics] [-query SPARQL]",
- "read TTL from stdin => format CoNLL-RDF or extract and highlight CoNLL (namespace conll:) and semantic (namespace terms:) subgraphs\ndefaults to -rdf if no options are selected",
- new Option[] {
- // Define cli options in the correct order for the help-message
- Option.builder("rdf").hasArgs().optionalArg(true)
- .desc("write formatted CoNLL-RDF to stdout (sorted by list of CoNLL COLS, if provided)")
- .build(),
- Option.builder("conll").hasArgs().optionalArg(true)
- .desc("write formatted CoNLL to stdout (only specified COLS)").build(),
- new Option("debug", false, "write formatted, color-highlighted full turtle to stderr"),
- new Option("grammar", false, "write CoNLL data structures to stdout"),
- new Option("semantics", false,
- "write semantic graph to stdout.\nif combined with -grammar, skip type assignments"),
- new Option("query", true, "write TSV generated from SPARQL statement to stdout"),
- new Option("sparqltsv", true, "deprecated: use -query instead") },
- LOG);
- // TODO which args are optional?
- final CommandLine cmd = conllCli.parseArgs(args);
-
- Module module;
-
- if (cmd.hasOption("conll")) {
- module = formatter.addModule(Mode.CONLL);
- String[] optionValues = cmd.getOptionValues("conll");
- if (optionValues != null) {
- module.setCols(Arrays.asList(optionValues));
- }
- }
- if (cmd.hasOption("rdf")) {
- module = formatter.addModule(Mode.CONLLRDF);
- String[] optionValues = cmd.getOptionValues("rdf");
- if (optionValues != null) {
- module.setCols(Arrays.asList(optionValues));
- }
- }
- if (cmd.hasOption("debug")) {
- module = formatter.addModule(Mode.DEBUG);
- module.setOutputStream(System.err);
- }
-
- if (cmd.hasOption("sparqltsv")) {
- LOG.warn("Option -sparqltsv has been deprecated in favor of -query");
- module = formatter.addModule(Mode.QUERY);
- module.setSelect(parseSparqlTSVOptionValues(cmd.getOptionValues("sparqltsv")));
- }
- if (cmd.hasOption("query")) {
- module = formatter.addModule(Mode.QUERY);
- module.setSelect(parseSparqlTSVOptionValues(cmd.getOptionValues("query")));
- }
- if (cmd.hasOption("query") && cmd.hasOption("sparqltsv")) {
- throw new ParseException("Tried to combine deprecated -sparqltsv and -query");
- }
-
- if (cmd.hasOption("grammar") && !cmd.hasOption("semantics")) {
- module = formatter.addModule(Mode.GRAMMAR);
- }
- if (cmd.hasOption("semantics") && !cmd.hasOption("grammar")) {
- module = formatter.addModule(Mode.SEMANTICS);
- }
- if (cmd.hasOption("semantics") && cmd.hasOption("grammar")) {
- module = formatter.addModule(Mode.GRAMMAR_SEMANTICS);
- }
-
- // if no modules were added, provide the default option
- if (formatter.getModules().isEmpty()) {
- LOG.info("No Option selected. Defaulting to Mode CoNLL-RDF");
- module = formatter.addModule(Mode.CONLLRDF);
- }
-
- return formatter;
- }
-
- static String parseSparqlTSVOptionValues(String[] optionValues) throws IOException, ParseException {
- // FIXME Legacy Code
- final String optionValue;
-
- if (optionValues.length == 1) {
- optionValue = optionValues[0];
- } else if (optionValues.length == 0) {
- // TODO this code should not be reachable
- throw new ParseException("Option-Value for -sparqltsv is an empty string.");
- } else {
- // because queries may be parsed by the shell (Cygwin)
- optionValue = String.join(" ", optionValues);
- }
-
- LOG.debug("Parsing Option-Value for -sparqltsv: " + optionValue);
-
- if (new File(optionValue).exists()) {
- LOG.debug("Attempting to read query from file");
- return readString(Paths.get(optionValue));
- }
-
- try {
- URL url = new URL(optionValue);
- LOG.debug("Attempting to read query from URL");
- return readUrl(url);
- } catch (MalformedURLException e) {
- LOG.debug(e);
- }
-
- // TODO consider verifying the output
- LOG.debug("Returning unchanged Option Value as Query");
- return optionValue;
- }
-
- static String parseQueryOptionValues(String[] optionValues) throws IOException, ParseException {
- final String optionValue;
- LOG.debug("Parsing Option-Value for -query");
- // TODO only URL and File
-
- if (optionValues.length == 1) {
- optionValue = optionValues[0];
- } else if (optionValues.length == 0) {
- // TODO this code should not be reachable
- optionValue = "";
- return optionValue;
- } else {
- LOG.error("Parsing multiple queries in one operation is not supported at the moment.");
- throw new ParseException("Expected a single file-path or URL as argument for query. Got "
- + optionValues.length + ":\n" + String.join(" ", optionValues));
- }
-
- if (new File(optionValue).exists()) {
- LOG.debug("Attempting to read query from file");
- return readString(Paths.get(optionValue));
- }
-
- try {
- URL url = new URL(optionValue);
- LOG.debug("Attempting to read query from URL");
- return readUrl(url);
- } catch (MalformedURLException e) {
- LOG.debug(e);
- }
-
- throw new ParseException("Failed to parse Option-Value as file-path or URL: " + optionValue);
- }
-
- @Override
- public CoNLLRDFFormatter buildFromJsonConf(ObjectNode conf) throws IOException {
- CoNLLRDFFormatter formatter = new CoNLLRDFFormatter();
-
- if (conf.path("output").isTextual()) {
- PrintStream output = parseConfAsOutputStream(conf.get("output").asText());
- formatter.setOutputStream(output);
- }
- for (JsonNode modConf : conf.withArray("modules")) {
- addModule(formatter, modConf);
- }
- if (formatter.getModules().size() == 0) {
- formatter.addModule(Mode.CONLLRDF);
- }
- return formatter;
- }
-
- private Module addModule(CoNLLRDFFormatter formatter, JsonNode modConf)
- throws IOException {
- ObjectMapper mapper = new ObjectMapper();
-
- Mode mode;
- JsonNode columnsArray = null;
- String select = "";
- PrintStream outputStream = null;
- String modeString = modConf.get("mode").asText();
- switch (modeString) {
- case "RDF":
- case "CONLLRDF":
- mode = Mode.CONLLRDF;
- columnsArray = modConf.withArray("columns");
- break;
- case "CONLL":
- mode = Mode.CONLL;
- columnsArray = modConf.withArray("columns");
- break;
- case "DEBUG":
- mode = Mode.DEBUG;
- outputStream = System.err;
- break;
- case "SPARQLTSV":
- LOG.warn("Mode SPARQLTSV is deprecated, please use QUERY instead.");
- case "QUERY":
- mode = Mode.QUERY;
- // TODO check URI
- select = readString(Paths.get(modConf.get("select").asText()));
- // TODO Attach context to IOExceptions thrown by readString
- break;
- case "GRAMMAR":
- mode = Mode.GRAMMAR;
- break;
- case "SEMANTICS":
- mode = Mode.SEMANTICS;
- break;
- case "GRAMMAR+SEMANTICS":
- mode = Mode.GRAMMAR_SEMANTICS;
- break;
-
- default:
- throw new IllegalArgumentException("Unknown mode: " + modeString);
- }
- Module module = formatter.addModule(mode);
-
- // select is either "" or a selectQuery as String
- module.setSelect(select);
- // convert JSON array to Java List
- if (columnsArray != null) {
- List<String> columnList = mapper.convertValue(columnsArray, new TypeReference<List<String>>() {});
- module.setCols(columnList);
- }
- // Set outputStream, if config has a property "output"
- if (modConf.path("output").isTextual()) {
- outputStream = parseConfAsOutputStream(modConf.get("output").asText());
- }
- // outputStream can be null or System.err
- module.setOutputStream(outputStream);
- return module;
- }
-}
+package org.acoli.conll.rdf;
+
+import static org.acoli.conll.rdf.CoNLLRDFCommandLine.readString;
+import static org.acoli.conll.rdf.CoNLLRDFCommandLine.readUrl;
+import static org.acoli.conll.rdf.CoNLLRDFManager.parseConfAsOutputStream;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.List;
+
+import com.fasterxml.jackson.core.type.TypeReference;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+
+import org.acoli.conll.rdf.CoNLLRDFFormatter.Mode;
+import org.acoli.conll.rdf.CoNLLRDFFormatter.Module;
+import org.acoli.fintan.core.FintanStreamComponentFactory;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.ParseException;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+public class CoNLLRDFFormatterFactory implements FintanStreamComponentFactory {
+ static Logger LOG = LogManager.getLogger(CoNLLRDFFormatterFactory.class);
+
+ @Override
+ public CoNLLRDFFormatter buildFromCLI(String[] args) throws IOException, ParseException {
+ final CoNLLRDFFormatter formatter = new CoNLLRDFFormatter();
+ final CoNLLRDFCommandLine conllCli = new CoNLLRDFCommandLine(
+ "CoNLLRDFFormatter [-rdf [COLS]] [-conll COLS] [-debug] [-grammar] [-semantics] [-query SPARQL]",
+ "read TTL from stdin => format CoNLL-RDF or extract and highlight CoNLL (namespace conll:) and semantic (namespace terms:) subgraphs\ndefaults to -rdf if no options are selected",
+ new Option[] {
+ // Define cli options in the correct order for the help-message
+ Option.builder("rdf").hasArgs().optionalArg(true)
+ .desc("write formatted CoNLL-RDF to stdout (sorted by list of CoNLL COLS, if provided)")
+ .build(),
+ Option.builder("conll").hasArgs().optionalArg(true)
+ .desc("write formatted CoNLL to stdout (only specified COLS)").build(),
+ new Option("debug", false, "write formatted, color-highlighted full turtle to stderr"),
+ new Option("grammar", false, "write CoNLL data structures to stdout"),
+ new Option("semantics", false,
+ "write semantic graph to stdout.\nif combined with -grammar, skip type assignments"),
+ new Option("query", true, "write TSV generated from SPARQL statement to stdout"),
+ new Option("sparqltsv", true, "deprecated: use -query instead") },
+ LOG);
+ // TODO which args are optional?
+ final CommandLine cmd = conllCli.parseArgs(args);
+
+ Module module;
+
+ if (cmd.hasOption("conll")) {
+ module = formatter.addModule(Mode.CONLL);
+ String[] optionValues = cmd.getOptionValues("conll");
+ if (optionValues != null) {
+ module.setCols(Arrays.asList(optionValues));
+ }
+ }
+ if (cmd.hasOption("rdf")) {
+ module = formatter.addModule(Mode.CONLLRDF);
+ String[] optionValues = cmd.getOptionValues("rdf");
+ if (optionValues != null) {
+ module.setCols(Arrays.asList(optionValues));
+ }
+ }
+ if (cmd.hasOption("debug")) {
+ module = formatter.addModule(Mode.DEBUG);
+ module.setOutputStream(System.err);
+ }
+
+ if (cmd.hasOption("sparqltsv")) {
+ LOG.warn("Option -sparqltsv has been deprecated in favor of -query");
+ module = formatter.addModule(Mode.QUERY);
+ module.setSelect(parseSparqlTSVOptionValues(cmd.getOptionValues("sparqltsv")));
+ }
+ if (cmd.hasOption("query")) {
+ module = formatter.addModule(Mode.QUERY);
+ module.setSelect(parseSparqlTSVOptionValues(cmd.getOptionValues("query")));
+ }
+ if (cmd.hasOption("query") && cmd.hasOption("sparqltsv")) {
+ throw new ParseException("Tried to combine deprecated -sparqltsv and -query");
+ }
+
+ if (cmd.hasOption("grammar") && !cmd.hasOption("semantics")) {
+ module = formatter.addModule(Mode.GRAMMAR);
+ }
+ if (cmd.hasOption("semantics") && !cmd.hasOption("grammar")) {
+ module = formatter.addModule(Mode.SEMANTICS);
+ }
+ if (cmd.hasOption("semantics") && cmd.hasOption("grammar")) {
+ module = formatter.addModule(Mode.GRAMMAR_SEMANTICS);
+ }
+
+ // if no modules were added, provide the default option
+ if (formatter.getModules().isEmpty()) {
+ LOG.info("No Option selected. Defaulting to Mode CoNLL-RDF");
+ module = formatter.addModule(Mode.CONLLRDF);
+ }
+
+ return formatter;
+ }
+
+ static String parseSparqlTSVOptionValues(String[] optionValues) throws IOException, ParseException {
+ // FIXME Legacy Code
+ final String optionValue;
+
+ if (optionValues.length == 1) {
+ optionValue = optionValues[0];
+ } else if (optionValues.length == 0) {
+ // TODO this code should not be reachable
+ throw new ParseException("Option-Value for -sparqltsv is an empty string.");
+ } else {
+ // because queries may be parsed by the shell (Cygwin)
+ optionValue = String.join(" ", optionValues);
+ }
+
+ LOG.debug("Parsing Option-Value for -sparqltsv: " + optionValue);
+
+ if (new File(optionValue).exists()) {
+ LOG.debug("Attempting to read query from file");
+ return readString(Paths.get(optionValue));
+ }
+
+ try {
+ URL url = new URL(optionValue);
+ LOG.debug("Attempting to read query from URL");
+ return readUrl(url);
+ } catch (MalformedURLException e) {
+ LOG.debug(e);
+ }
+
+ // TODO consider verifying the output
+ LOG.debug("Returning unchanged Option Value as Query");
+ return optionValue;
+ }
+
+ static String parseQueryOptionValues(String[] optionValues) throws IOException, ParseException {
+ final String optionValue;
+ LOG.debug("Parsing Option-Value for -query");
+ // TODO only URL and File
+
+ if (optionValues.length == 1) {
+ optionValue = optionValues[0];
+ } else if (optionValues.length == 0) {
+ // TODO this code should not be reachable
+ optionValue = "";
+ return optionValue;
+ } else {
+ LOG.error("Parsing multiple queries in one operation is not supported at the moment.");
+ throw new ParseException("Expected a single file-path or URL as argument for query. Got "
+ + optionValues.length + ":\n" + String.join(" ", optionValues));
+ }
+
+ if (new File(optionValue).exists()) {
+ LOG.debug("Attempting to read query from file");
+ return readString(Paths.get(optionValue));
+ }
+
+ try {
+ URL url = new URL(optionValue);
+ LOG.debug("Attempting to read query from URL");
+ return readUrl(url);
+ } catch (MalformedURLException e) {
+ LOG.debug(e);
+ }
+
+ throw new ParseException("Failed to parse Option-Value as file-path or URL: " + optionValue);
+ }
+
+ @Override
+ public CoNLLRDFFormatter buildFromJsonConf(ObjectNode conf) throws IOException {
+ CoNLLRDFFormatter formatter = new CoNLLRDFFormatter();
+
+ if (conf.path("output").isTextual()) {
+ PrintStream output = parseConfAsOutputStream(conf.get("output").asText());
+ formatter.setOutputStream(output);
+ }
+ for (JsonNode modConf : conf.withArray("modules")) {
+ addModule(formatter, modConf);
+ }
+ if (formatter.getModules().size() == 0) {
+ formatter.addModule(Mode.CONLLRDF);
+ }
+ return formatter;
+ }
+
+ private Module addModule(CoNLLRDFFormatter formatter, JsonNode modConf)
+ throws IOException {
+ ObjectMapper mapper = new ObjectMapper();
+
+ Mode mode;
+ JsonNode columnsArray = null;
+ String select = "";
+ PrintStream outputStream = null;
+ String modeString = modConf.get("mode").asText();
+ switch (modeString) {
+ case "RDF":
+ case "CONLLRDF":
+ mode = Mode.CONLLRDF;
+ columnsArray = modConf.withArray("columns");
+ break;
+ case "CONLL":
+ mode = Mode.CONLL;
+ columnsArray = modConf.withArray("columns");
+ break;
+ case "DEBUG":
+ mode = Mode.DEBUG;
+ outputStream = System.err;
+ break;
+ case "SPARQLTSV":
+ LOG.warn("Mode SPARQLTSV is deprecated, please use QUERY instead.");
+ case "QUERY":
+ mode = Mode.QUERY;
+ // TODO check URI
+ select = readString(Paths.get(modConf.get("select").asText()));
+ // TODO Attach context to IOExceptions thrown by readString
+ break;
+ case "GRAMMAR":
+ mode = Mode.GRAMMAR;
+ break;
+ case "SEMANTICS":
+ mode = Mode.SEMANTICS;
+ break;
+ case "GRAMMAR+SEMANTICS":
+ mode = Mode.GRAMMAR_SEMANTICS;
+ break;
+
+ default:
+ throw new IllegalArgumentException("Unknown mode: " + modeString);
+ }
+ Module module = formatter.addModule(mode);
+
+ // select is either "" or a selectQuery as String
+ module.setSelect(select);
+ // convert JSON array to Java List
+ if (columnsArray != null) {
+ List<String> columnList = mapper.convertValue(columnsArray, new TypeReference<List<String>>() {});
+ module.setCols(columnList);
+ }
+ // Set outputStream, if config has a property "output"
+ if (modConf.path("output").isTextual()) {
+ outputStream = parseConfAsOutputStream(modConf.get("output").asText());
+ }
+ // outputStream can be null or System.err
+ module.setOutputStream(outputStream);
+ return module;
+ }
+}
diff --git a/src/main/java/org/acoli/conll/rdf/CoNLLRDFManager.java b/src/main/java/org/acoli/conll/rdf/CoNLLRDFManager.java
index 23f1b40..67ae092 100644
--- a/src/main/java/org/acoli/conll/rdf/CoNLLRDFManager.java
+++ b/src/main/java/org/acoli/conll/rdf/CoNLLRDFManager.java
@@ -1,12 +1,9 @@
package org.acoli.conll.rdf;
-import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
-import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
-import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PipedInputStream;
import java.io.PipedOutputStream;
@@ -25,11 +22,14 @@
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
+import org.acoli.fintan.core.FintanManager;
+import org.acoli.fintan.core.FintanStreamComponent;
+import org.acoli.fintan.core.FintanStreamComponentFactory;
-public class CoNLLRDFManager {
+public class CoNLLRDFManager extends FintanManager {
static Logger LOG = LogManager.getLogger(CoNLLRDFManager.class);
- static Map> classFactoryMap;
+ static Map<String, Supplier<FintanStreamComponentFactory>> classFactoryMap;
static {
classFactoryMap = new HashMap<>();
classFactoryMap.put(CoNLLStreamExtractor.class.getSimpleName(), () -> new CoNLLStreamExtractorFactory());
@@ -43,7 +43,7 @@ public class CoNLLRDFManager {
private OutputStream output;
private JsonNode[] pipeline;
private JsonNode config;
- private ArrayList componentStack = new ArrayList();
+ private ArrayList<FintanStreamComponent> componentStack = new ArrayList<>();
public InputStream getInput() {
return input;
@@ -77,26 +77,24 @@ public void setConfig(JsonNode config) {
this.config = config;
}
- ArrayList getComponentStack() {
+ ArrayList<FintanStreamComponent> getComponentStack() {
return componentStack;
}
- void setComponentStack(ArrayList componentStack) {
+ void setComponentStack(ArrayList<FintanStreamComponent> componentStack) {
this.componentStack = componentStack;
- }
+ }
public static void main(String[] args) throws IOException {
- final CoNLLRDFManager manager;
try {
- manager = new CoNLLRDFManagerFactory().buildFromCLI(args);
- manager.buildComponentStack();
- } catch (ParseException e) {
+ FintanManager.main(args);
+ } catch (IOException e) {
+ throw e;
+ }catch (Exception e) {
LOG.error(e);
System.exit(1);
return;
}
-
- manager.start();
}
protected static InputStream parseConfAsInputStream(String confEntry) throws IOException {
@@ -133,7 +131,7 @@ protected static PrintStream parseConfAsOutputStream(String confEntry) throws IO
return output;
}
- public void buildComponentStack() throws IOException, ParseException {
+ public void buildComponentStack() throws IOException {
//READ PIPELINE PARAMETER
/*
JsonNode pipelineNode = config.get("pipeline");
@@ -149,21 +147,21 @@ public void buildComponentStack() throws IOException, ParseException {
linkComponents(componentStack, input, output);
}
- static ArrayList parsePipeline(Iterable pipelineArray) throws IOException, ParseException {
- ArrayList componentArray = new ArrayList<>();
+ static ArrayList<FintanStreamComponent> parsePipeline(Iterable<JsonNode> pipelineArray) throws IOException, ParseException {
+ ArrayList<FintanStreamComponent> componentArray = new ArrayList<>();
for (JsonNode pipelineElement:pipelineArray) {
if (!pipelineElement.getNodeType().equals(JsonNodeType.OBJECT)) {
throw new IllegalArgumentException("Elements of \"pipeline\" have to be obejct-type");
}
- // Create CoNLLRDFComponents (StreamExtractor, Updater, Formatter ...)
+ // Create FintanStreamComponents (StreamExtractor, Updater, Formatter ...)
String className = pipelineElement.required("class").asText();
if (!classFactoryMap.containsKey(className)) {
throw new IllegalArgumentException( "Unknown class: " + className);
}
- CoNLLRDFComponent component = classFactoryMap.get(className).get().buildFromJsonConf((ObjectNode) pipelineElement);
+ FintanStreamComponent component = classFactoryMap.get(className).get().buildFromJsonConf((ObjectNode) pipelineElement);
componentArray.add(component);
}
return componentArray;
@@ -175,9 +173,9 @@ static ArrayList parsePipeline(Iterable pipelineArr
* @param input Link this to the first component
* @param output Link last component to this.
*/
- static void linkComponents(List componentArray, InputStream input, OutputStream output) throws IOException {
- CoNLLRDFComponent prevComponent = null;
- for (CoNLLRDFComponent component : componentArray) {
+ static void linkComponents(List<FintanStreamComponent> componentArray, InputStream input, OutputStream output) throws IOException {
+ FintanStreamComponent prevComponent = null;
+ for (FintanStreamComponent component : componentArray) {
if (prevComponent == null) {
// link input to first component
component.setInputStream(input);
@@ -196,7 +194,7 @@ static void linkComponents(List componentArray, InputStream i
}
public void start() {
- for (CoNLLRDFComponent component:componentStack) {
+ for (FintanStreamComponent component:componentStack) {
Thread t = new Thread(component);
t.start();
}
diff --git a/src/main/java/org/acoli/conll/rdf/CoNLLRDFUpdater.java b/src/main/java/org/acoli/conll/rdf/CoNLLRDFUpdater.java
index 9a385cf..6446fba 100644
--- a/src/main/java/org/acoli/conll/rdf/CoNLLRDFUpdater.java
+++ b/src/main/java/org/acoli/conll/rdf/CoNLLRDFUpdater.java
@@ -15,19 +15,17 @@
*/
package org.acoli.conll.rdf;
-import static org.acoli.conll.rdf.CoNLLRDFCommandLine.*;
+import static org.acoli.conll.rdf.CoNLLRDFCommandLine.readString;
+import static org.acoli.conll.rdf.CoNLLRDFCommandLine.readUrl;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
-import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
-import java.io.PrintStream;
import java.io.StringReader;
import java.io.StringWriter;
-import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URI;
@@ -39,8 +37,11 @@
import java.util.Iterator;
import java.util.List;
import java.util.UUID;
-import java.util.zip.GZIPInputStream;
+import org.acoli.fintan.core.FintanStreamHandler;
+import org.acoli.fintan.load.RDFStreamLoader;
+import org.acoli.fintan.rdf.RDFUpdater;
+import org.acoli.fintan.write.RDFStreamWriter;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.ImmutableTriple;
@@ -65,7 +66,7 @@
* @author Christian Chiarcos {@literal chiarcos@informatik.uni-frankfurt.de}
* @author Christian Faeth {@literal faeth@em.uni-frankfurt.de}
*/
-public class CoNLLRDFUpdater extends CoNLLRDFComponent {
+public class CoNLLRDFUpdater extends RDFUpdater {
static final Logger LOG = LogManager.getLogger(CoNLLRDFUpdater.class);
private final Dataset dataset;
@@ -94,7 +95,7 @@ public class CoNLLRDFUpdater extends CoNLLRDFComponent {
private final List<String> sentBufferLookahead = Collections.synchronizedList(new ArrayList<String>());
private final List<String> sentBufferLookback = Collections.synchronizedList(new ArrayList<String>());
// Buffer for outputting sentences in original order
- private final List sentBufferOut = Collections.synchronizedList(new ArrayList());
+ private final List<String> sentBufferOut = Collections.synchronizedList(new ArrayList<String>());
//for statistics
private final List<List<Pair<Integer, Long>>> dRTs = Collections.synchronizedList(new ArrayList<List<Pair<Integer, Long>>>());
@@ -102,18 +103,17 @@ public class CoNLLRDFUpdater extends CoNLLRDFComponent {
private class UpdateThread extends Thread {
-
private CoNLLRDFUpdater updater;
private int threadID;
private Dataset memDataset;
-
+
/**
* Each UpdateThread receives its own ID and a back-reference to the calling Updater.
- *
+ *
* In the current implementation, each thread manages its own in-memory Dataset.
* This is the fastest approach since no concurring access on a single Datasets occurs.
* However: lots of RAM may be needed.
- *
+ *
* @param updater
* The calling Updater (= ThreadHandler)
* @param id
@@ -131,7 +131,7 @@ public UpdateThread(CoNLLRDFUpdater updater, int id) {
memDataset.addNamedModel("https://github.com/acoli-repo/conll-rdf/lookback", ModelFactory.createDefaultModel());
memDataset.addNamedModel("https://github.com/acoli-repo/conll-rdf/lookahead", ModelFactory.createDefaultModel());
}
-
+
/**
* Run the update thread.
* Load the buffer, execute the updates with all iterations and graphsout, unload the buffer.
@@ -141,11 +141,17 @@ public void run() {
//Execute Thread
LOG.trace("NOW Processing on thread "+threadID+": outputbuffersize "+sentBufferOut.size());
- Triple<List<String>, String, List<String>> sentBufferThread = sentBufferThreads.get(threadID);
+
+ // unpack triple for better readability of code
+ final Triple<List<String>, String, List<String>> sentBufferThread = sentBufferThreads.get(threadID);
+ final List<String> lookbackSentenceList = sentBufferThread.getLeft();
+ final String currentSentence = sentBufferThread.getMiddle();
+ final List<String> lookaheadSentenceList = sentBufferThread.getRight();
+
StringWriter out = new StringWriter();
try {
- loadBuffer(sentBufferThread);
-
+ loadBuffer(lookbackSentenceList, currentSentence, lookaheadSentenceList);
+
List<Pair<Integer, Long>> ret = executeUpdates(updates);
if (dRTs.get(threadID).isEmpty())
dRTs.get(threadID).addAll(ret);
@@ -154,8 +160,8 @@ public void run() {
dRTs.get(threadID).set(x, new ImmutablePair(
dRTs.get(threadID).get(x).getKey() + ret.get(x).getKey(),
dRTs.get(threadID).get(x).getValue() + ret.get(x).getValue()));
-
- unloadBuffer(sentBufferThread, out);
+
+ unloadBuffer(currentSentence, out);
} catch (Exception e) {
// memDataset.begin(ReadWrite.WRITE);
memDataset.getDefaultModel().removeAll();
@@ -175,12 +181,11 @@ public void run() {
sentBufferOut.set(i, out.toString());
break;
}
- }
-
+ }
+
//go to sleep and let Updater take control
LOG.trace("Updater notified by "+threadID);
updater.notify();
-
}
try {
synchronized (this) {
@@ -193,44 +198,44 @@ public void run() {
}
}
}
-
+
/**
* Loads Data to this thread's working model.
- * @param buffer
- * the model to be read.
- * @throws Exception
*/
- private void loadBuffer(Triple<List<String>, String, List<String>> sentBufferThread) throws Exception { //TODO: adjust for TXN-Models
- //check validity of current sentence
- isValidUTF8(sentBufferThread.getMiddle(), "Input data encoding issue for \"" + sentBufferThread.getMiddle() + "\"");
- //load ALL
+ private void loadBuffer(List<String> lookbackSentenceList, String currentSentence,
+ List<String> lookaheadSentenceList) throws Exception {
+ // TODO: adjust for TXN-Models
+ // check validity of current sentence
+
+ // load ALL
try {
-// memDataset.begin(ReadWrite.WRITE);
-
+ // memDataset.begin(ReadWrite.WRITE);
+
// for lookback
- for (String sent:sentBufferThread.getLeft()) {
- memDataset.getNamedModel("https://github.com/acoli-repo/conll-rdf/lookback").read(new StringReader(sent),null, "TTL");
+ for (String sentence : lookbackSentenceList) {
+ memDataset.getNamedModel("https://github.com/acoli-repo/conll-rdf/lookback")
+ .read(new StringReader(sentence), null, "TTL");
}
-
+
// for current sentence
- memDataset.getDefaultModel().read(new StringReader(sentBufferThread.getMiddle()),null, "TTL");
+ memDataset.getDefaultModel().read(new StringReader(currentSentence), null, "TTL");
// for lookahead
- for (String sent:sentBufferThread.getRight()) {
- memDataset.getNamedModel("https://github.com/acoli-repo/conll-rdf/lookahead").read(new StringReader(sent),null, "TTL");
+ for (String sentence : lookaheadSentenceList) {
+ memDataset.getNamedModel("https://github.com/acoli-repo/conll-rdf/lookahead")
+ .read(new StringReader(sentence), null, "TTL");
}
-
-// memDataset.commit();
-// Model m = ModelFactory.createDefaultModel().read(new StringReader(buffer),null, "TTL");
-// memAccessor.add(m);
-// memDataset.getDefaultModel().setNsPrefixes(m.getNsPrefixMap());
+
+ // memDataset.commit();
+ // Model m = ModelFactory.createDefaultModel().read(new StringReader(buffer),null, "TTL");
+ // memAccessor.add(m);
+ // memDataset.getDefaultModel().setNsPrefixes(m.getNsPrefixMap());
} catch (Exception ex) {
- LOG.error("Exception while reading: " + sentBufferThread.getMiddle());
+ LOG.error("Exception while reading: " + currentSentence);
throw ex;
} finally {
-// memDataset.end();
+ // memDataset.end();
}
-
}
/**
@@ -242,35 +247,35 @@ private void loadBuffer(Triple, String, List> sentBufferThr
* Output Writer.
* @throws Exception
*/
- private void unloadBuffer(Triple<List<String>, String, List<String>> sentBufferThread, Writer out) throws Exception { //TODO: adjust for TXN-Models
- String buffer = sentBufferThread.getMiddle();
+ private void unloadBuffer(String currentSentence, Writer out)
+ throws Exception { // TODO: adjust for TXN-Models
try {
- BufferedReader in = new BufferedReader(new StringReader(buffer));
+ BufferedReader in = new BufferedReader(new StringReader(currentSentence));
String line;
- while((line=in.readLine())!=null) {
- line=line.trim();
- if(line.startsWith("#")) out.write(line+"\n");
+ while ((line = in.readLine()) != null) {
+ line = line.trim();
+ if (line.startsWith("#"))
+ out.write(line + "\n");
}
memDataset.getDefaultModel().write(out, "TTL");
out.write("\n");
out.flush();
} catch (Exception ex) {
-// memDataset.abort();
- LOG.error("Exception while unloading: " + buffer);
+ // memDataset.abort();
+ LOG.error("Exception while unloading: " + currentSentence);
} finally {
-// memDataset.begin(ReadWrite.WRITE);
+ // memDataset.begin(ReadWrite.WRITE);
memDataset.getDefaultModel().removeAll();
memDataset.getNamedModel("https://github.com/acoli-repo/conll-rdf/lookback").removeAll();
memDataset.getNamedModel("https://github.com/acoli-repo/conll-rdf/lookahead").removeAll();
-// memDataset.commit();
-// memDataset.end();
+ // memDataset.commit();
+ // memDataset.end();
}
-
}
-
+
/**
* Executes updates on this thread. Data must be preloaded first.
- *
+ *
* @param updates
* The updates as a List of Triples containing
* - update filename
@@ -281,7 +286,7 @@ private void unloadBuffer(Triple, String, List> sentBufferT
* - total no. of iterations
* - total time
*/
- private List> executeUpdates(List> updates) {
+ private List> executeUpdates(List> updates) {
String sent = new String();
boolean graphsout = false;
@@ -289,7 +294,7 @@ private List> executeUpdates(List> executeUpdates(List> executeUpdates(List(v, System.currentTimeMillis() - startTime));
defaultModel.unregister(cL);
upd_id++;
- }
+ }
return result;
}
-
+
/**
* Produce dotFile for a specific update iteration.
- *
+ *
* @param m
* The current model.
* @param updateSrc
@@ -420,7 +425,7 @@ private void produceDot(Model m, String updateSrc, String updateQuery, String se
if (graphOutputDir != null) {
String updateName = (new File(updateSrc)).getName();
updateName = (updateName != null && !updateName.isEmpty()) ? updateName : UUID.randomUUID().toString();
-
+
File outputFile = new File(graphOutputDir, sent
+"__U"+String.format("%03d", upd_id)
+"_I" +String.format("%04d", iter_id)
@@ -428,12 +433,12 @@ private void produceDot(Model m, String updateSrc, String updateQuery, String se
+"__" +updateName.replace(".sparql", "")+".dot");
Writer w = new OutputStreamWriter(new FileOutputStream(outputFile), StandardCharsets.UTF_8);
CoNLLRDFViz.produceDot(m, w, updateQuery);
- }
+ }
}
-
+
/**
* Produce lexicographically sorted ntriples-file for a specific update iteration.
- *
+ *
* @param m
* The current model.
* @param updateSrc
@@ -454,7 +459,7 @@ private void produceNTRIPLES(Model m, String updateSrc, String updateQuery, Stri
if (triplesOutputDir != null) {
String updateName = (new File(updateSrc)).getName();
updateName = (updateName != null && !updateName.isEmpty()) ? updateName : UUID.randomUUID().toString();
-
+
File outputFile = new File(triplesOutputDir, sent
+"__U"+String.format("%03d", upd_id)
+"_I" +String.format("%04d", iter_id)
@@ -473,7 +478,7 @@ private void produceNTRIPLES(Model m, String updateSrc, String updateQuery, Stri
}
out.flush();
out.close();
- }
+ }
}
}
@@ -483,19 +488,19 @@ private void produceNTRIPLES(Model m, String updateSrc, String updateQuery, Stri
public CoNLLRDFUpdater() {
this("", "", 0);
}
-
+
/**
* Standard Constructor for Updater. Creates Threads and Buffers for Thread handling.
* Also creates the database modules for the respective execution modes.
* @param type: The type of database to be used:
- * MEM: fully independent in-memory datasets per thread
+ * MEM: fully independent in-memory datasets per thread
* (fastest, no transactions, high RAM usage, no HDD)
* TXN: single transactional in-memory dataset for all threads
* (in development, medium speed and RAM, no HDD)
* TDB2: single transactional TDB2-database for all threads
* (in development, slow-medium speed, low RAM usage, high HDD usage)
* default: MEM
- * @param path:
+ * @param path:
* path to database (only for TDB2 or other DB-backed modes)
* @param threads
* Maximum amount of threads for execution.
@@ -629,9 +634,9 @@ public boolean getPrefixDeduplication() {
}
/**
- * Load external RDF file into a named graph of the local dataset.
+ * Load external RDF file into a named graph of the local dataset.
* This graph is permanent for the runtime and is accessed read-only by all threads.
- * The default graph of the local dataset is reserved for updating nif:Sentences and
+ * The default graph of the local dataset is reserved for updating nif:Sentences and
* can not be defined here.
* @param url
* location of the RDF file to be loaded
@@ -651,7 +656,7 @@ public void loadGraph(URI url, URI graph) throws IOException {
}
Model m = ModelFactory.createDefaultModel();
try {
- m.read(readInURI(url));
+ m.read(CoNLLRDFUtil.readInURI(url));
dataset.addNamedModel(graph.toString(), m);
} catch (IOException ex) {
LOG.error("Exception while reading " + url + " into " + graph);
@@ -704,7 +709,7 @@ public void parseUpdates(List> updatesRaw) throws
updateScript = readUrl(url);
} catch (MalformedURLException e) {
LOG.trace(e);
- LOG.debug("Update is not a valid URL " + updateScriptRaw); // this occurs if the update is verbatim
+ LOG.debug("Update is not a valid URL {}", updateScriptRaw); // this occurs if the update is verbatim
} catch (IOException e) {
throw new IOException("Failed to open input stream from URL " + updateScriptRaw, e);
}
@@ -747,150 +752,34 @@ public void parseUpdates(List> updatesRaw) throws
updates.addAll(Collections.synchronizedList(updatesOut));
}
- /**
- * Tries to read from a specific URI.
- * Tries to read content directly or from GZIP
- * Validates content against UTF-8.
- * @param uri
- * the URI to be read
- * @return
- * the text content
- * @throws MalformedURLException
- * @throws IOException
- */
- private static String readInURI(URI uri) throws MalformedURLException, IOException {
- String result = null;
- try {
- result = uri.toString();
- if (result != null && result.endsWith(".gz")) {
- StringBuilder sb = new StringBuilder();
- BufferedReader br = new BufferedReader(new InputStreamReader(new GZIPInputStream(uri.toURL().openStream())));
- for (String line; (line = br.readLine()) != null; sb.append(line));
- result = sb.toString();
- isValidUTF8(result, "Given URI input (" + uri.getPath() + ") is not UTF-8 encoded");
- }
- } catch (Exception ex) {
- LOG.error("Excpetion while reading " + uri.getPath());
- throw ex;
- }
- return result;
- }
-
- private static void isValidUTF8(String s, String message) {
- try
- {
- s.getBytes("UTF-8");
- }
- catch (UnsupportedEncodingException e)
- {
- LOG.error(message + " - Encoding error: " + e.getMessage());
- System.exit(-1);
- }
- }
-
/**
* Processes CoNLL-RDF on the local dataset using the predfined updates and threads.
- * Streams data from a buffered reader to a buffered writer. Distributes the processing
+ * Streams data from a buffered reader to a buffered writer. Distributes the processing
* across available threads. Each thread handles one sentence at a time.
* Caches and outputs the resulting sentences in-order.
* @throws IOException
+ * @throws InterruptedException
*/
- @Override
- protected void processSentenceStream() throws IOException {
+ // FIXME @Override
+ protected void processSentenceStream() throws IOException, InterruptedException {
initThreads();
running = true;
- BufferedReader in = new BufferedReader(new InputStreamReader(getInputStream()));
- PrintStream out = new PrintStream(getOutputStream());
-
- String prefixCache = new String();
- String line;
- String lastLine ="";
- String buffer="";
-// List > dRTs = new ArrayList >(); // iterations and execution time of each update in seconds
- while((line = in.readLine())!=null) {
- line=line.replaceAll("[\t ]+"," ").trim();
-
- if(!buffer.trim().equals("") && (line.startsWith("@") || line.startsWith("#")) && !lastLine.startsWith("@") && !lastLine.startsWith("#")) { //!buffer.matches("@[^\n]*\n?$")) {
- // If the buffer is not empty and the current line starts with @ or #
- // and the previous line did not start with @ or #
- // check if the buffer contains a ttl prefix
- if (buffer.contains("@prefix")) {
- prefixCache = new String();
- for (String buffLine:buffer.split("\n")) {
- if (buffLine.trim().startsWith("@prefix")) {
- prefixCache += buffLine+"\n";
- }
- }
- } else {
- buffer = prefixCache+buffer;
- }
-
- // GRAPH OUTPUT determine first sentence's id, if none were specified
- if ((graphOutputDir != null) && (graphOutputSentences.isEmpty())) {
- String sentID = readFirstSentenceID(buffer);
- graphOutputSentences.add(sentID);
- LOG.debug("Graph Output defaults to first sentence: " + sentID);
- }
- // TRIPLES OUTPUT determine first sentence's id, if none were specified
- if ((triplesOutputDir != null) && (triplesOutputSentences.isEmpty())) {
- String sentID = readFirstSentenceID(buffer);
- triplesOutputSentences.add(sentID);
- LOG.debug("Triples Output defaults to first sentence: " + sentID);
- }
-
- //lookahead
- //add ALL sentences to sentBufferLookahead
- sentBufferLookahead.add(buffer);
- if (sentBufferLookahead.size() > lookahead_snts) {
- //READY TO PROCESS
- // remove first sentence from buffer and process it.
- // !!if lookahead = 0 then only current buffer is in sentBufferLookahead!!
- executeThread(sentBufferLookahead.remove(0));
- }
-
- //lookback
- //needs to consider lookahead buffer. The full buffer size needs to be lookahead + lookback.
- if (lookback_snts > 0) {
- while (sentBufferLookback.size() >= lookback_snts + sentBufferLookahead.size()) sentBufferLookback.remove(0);
- sentBufferLookback.add(buffer);
- }
+ Model model;
+ String prefixCache = "";
- flushOutputBuffer(out);
- buffer="";
- }
- buffer=buffer+line+"\n";
- lastLine=line;
- }
-
- // FINAL SENTENCE (with prefixes if necessary)
- if (!buffer.contains("@prefix")) {
- buffer = prefixCache+buffer;
- }
-
- // To address the edge case of no comments or prefixes occuring after the first sentence of a stream
- // GRAPH OUTPUT determine first sentence's id, if none were specified
- if ((graphOutputDir != null) && (graphOutputSentences.isEmpty())) {
- String sentID = readFirstSentenceID(buffer);
- graphOutputSentences.add(sentID);
- LOG.debug("Graph Output defaults to first sentence: " + sentID);
- }
- // TRIPLES OUTPUT determine first sentence's id, if none were specified
- if ((triplesOutputDir != null) && (triplesOutputSentences.isEmpty())) {
- String sentID = readFirstSentenceID(buffer);
- triplesOutputSentences.add(sentID);
- LOG.debug("Triples Output defaults to first sentence: " + sentID);
+ while((model = getInputStream().read()) != null) {
+ prefixCache = processModel(prefixCache, model);
+ flushOutputBuffer();
}
// LOOKAHEAD work down remaining buffer
- sentBufferLookahead.add(buffer);
while (sentBufferLookahead.size()>0) {
- executeThread(sentBufferLookahead.remove(0));
+ executeThreadWithLookaround(sentBufferLookahead.remove(0));
if (lookback_snts > 0) {
while (sentBufferLookback.size() >= lookback_snts + sentBufferLookahead.size()) sentBufferLookback.remove(0);
}
}
-
-
+
//wait for threads to finish work
boolean threadsRunning = true;
while(threadsRunning) {
@@ -914,7 +803,7 @@ protected void processSentenceStream() throws IOException {
}
}
}
-
+
//sum up statistics
List> dRTs_sum = new ArrayList >();
for (List> dRT_thread:dRTs) {
@@ -925,26 +814,76 @@ protected void processSentenceStream() throws IOException {
dRTs_sum.set(x, new ImmutablePair(
dRTs_sum.get(x).getKey() + dRT_thread.get(x).getKey(),
dRTs_sum.get(x).getValue() + dRT_thread.get(x).getValue()));
-
+
}
if (!dRTs_sum.isEmpty())
LOG.debug("Done - List of iterations and execution times for the updates done (in given order):\n\t\t" + dRTs_sum.toString());
//final flush
- flushOutputBuffer(out);
- getOutputStream().close();
-
+ flushOutputBuffer();
+ getOutputStream().terminate();
+ }
+
+ private String processModel(String prefixCache, Model model) {
+ //!buffer.matches("@[^\n]*\n?$")) {
+ String buffer = CoNLLRDFUtil.conllRdfModel2String(model);
+ // If the buffer is not empty and the current line starts with @ or #
+ // and the previous line did not start with @ or #
+ // check if the buffer contains a ttl prefix
+
+ // Map prefixMap = model.getNsPrefixMap();
+ if (buffer.contains("@prefix")) {
+ prefixCache = "";
+ for (String buffLine:buffer.split("\n")) {
+ if (buffLine.trim().startsWith("@prefix")) {
+ prefixCache += buffLine+"\n";
+ }
+ }
+ } else {
+ buffer = prefixCache+buffer;
+ }
+
+ // GRAPH OUTPUT determine first sentence's id, if none were specified
+ if ((graphOutputDir != null) && (graphOutputSentences.isEmpty())) {
+ String sentID = readFirstSentenceID(model);
+ graphOutputSentences.add(sentID);
+ LOG.debug("Graph Output defaults to first sentence: {}", sentID);
+ }
+ // TRIPLES OUTPUT determine first sentence's id, if none were specified
+ if ((triplesOutputDir != null) && (triplesOutputSentences.isEmpty())) {
+ String sentID = readFirstSentenceID(model);
+ triplesOutputSentences.add(sentID);
+ LOG.debug("Triples Output defaults to first sentence: {}", sentID);
+ }
+
+ //lookahead
+ //add ALL sentences to sentBufferLookahead
+ sentBufferLookahead.add(buffer);
+ if (sentBufferLookahead.size() > lookahead_snts) {
+ //READY TO PROCESS
+ // remove first sentence from buffer and process it.
+ // !!if lookahead = 0 then only current buffer is in sentBufferLookahead!!
+ executeThreadWithLookaround(sentBufferLookahead.remove(0));
+ }
+
+ //lookback
+ //needs to consider lookahead buffer. The full buffer size needs to be lookahead + lookback.
+ if (lookback_snts > 0) {
+ while (sentBufferLookback.size() >= lookback_snts + sentBufferLookahead.size()) sentBufferLookback.remove(0);
+ sentBufferLookback.add(buffer);
+ }
+
+ return prefixCache;
}
/**
- * Retrieve the first "Sentence ID" (nif-core#Sentence -property) from the buffer and return it
+ * Retrieve the first "Sentence ID" (nif-core#Sentence -property) from the model and return it
*/
- private String readFirstSentenceID(String buffer) {
- Model m = ModelFactory.createDefaultModel();
- String sentID = m.read(new StringReader(buffer),null, "TTL").listSubjectsWithProperty(
- m.getProperty("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"),
- m.getProperty("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#Sentence")
- ).next().getLocalName();
+ private String readFirstSentenceID(Model model) {
+ String sentID = model
+ .listResourcesWithProperty(model.getProperty("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"),
+ model.getProperty("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#Sentence"))
+ .next().getLocalName();
return sentID;
}
@@ -964,18 +903,18 @@ private void initThreads() {
}
}
- private synchronized void flushOutputBuffer(PrintStream out) {
- LOG.trace("OutBufferSize: "+sentBufferOut.size());
+ private synchronized void flushOutputBuffer() throws InterruptedException {
+ LOG.trace("OutBufferSize: " + sentBufferOut.size());
String prefixCacheOut = new String();
- while (!sentBufferOut.isEmpty()) {
- if (sentBufferOut.get(0).matches("\\d+")) break;
-
+ for (String buffer : sentBufferOut) {
+ if (buffer.matches("\\d+")) break;
+
String outString = new String();
if (prefixDeduplication) {
String prefixCacheTMP = new String();
- for (String buffLine:sentBufferOut.remove(0).split("\n")) {
+ for (String buffLine:buffer.split("\n")) {
if (buffLine.trim().startsWith("@prefix")) {
prefixCacheTMP += buffLine+"\n";
} else if (!buffLine.trim().isEmpty()) {
@@ -987,25 +926,25 @@ private synchronized void flushOutputBuffer(PrintStream out) {
outString = prefixCacheTMP + outString + "\n";
}
} else {
- outString = sentBufferOut.remove(0);
+ outString = buffer;
}
if (!outString.endsWith("\n\n")) outString += "\n";
- out.print(outString);
+ getOutputStream().write(ModelFactory.createDefaultModel().read(new StringReader(outString), null, "TTL"));
}
}
- private void executeThread(String buffer) {
- MutableTriple, String, List>sentBufferThread =
- new MutableTriple, String, List>(
- new ArrayList(), new String(), new ArrayList());
+ private void executeThreadWithLookaround(String buffer) {
//sentBufferLookback only needs to be filled up to the current sentence.
//All other sentences are for further lookahead iterations
-// sentBufferThread.getLeft().addAll(sentBufferLookback);
+ ArrayList reducedSentenceBufferLookback = new ArrayList<>();
for (int i = 0; i < sentBufferLookback.size() - sentBufferLookahead.size(); i++) {
- sentBufferThread.getLeft().add(sentBufferLookback.get(i));
+ reducedSentenceBufferLookback.add(sentBufferLookback.get(i));
}
- sentBufferThread.setMiddle(buffer);
- sentBufferThread.getRight().addAll(sentBufferLookahead);
+ executeThread(reducedSentenceBufferLookback, buffer, sentBufferLookahead);
+ }
+
+ private void executeThread(List lookback, String buffer, List lookahead) {
+ MutableTriple, String, List> sentBufferThread = new MutableTriple<>(lookback, buffer, lookahead);
int i = 0;
while(i < updateThreads.size()) {
@@ -1018,7 +957,7 @@ private void executeThread(String buffer) {
LOG.trace("restart "+i);
LOG.trace("OutBufferSize: "+sentBufferOut.size());
break;
- } else
+ } else
if (updateThreads.get(i).getState() == Thread.State.WAITING) {
synchronized(updateThreads.get(i)) {
sentBufferThreads.set(i, sentBufferThread);
@@ -1027,7 +966,7 @@ private void executeThread(String buffer) {
}
LOG.trace("wake up "+i);
break;
- } else
+ } else
if (updateThreads.get(i).getState() == Thread.State.NEW) {
sentBufferThreads.set(i, sentBufferThread);
sentBufferOut.add(String.valueOf(i)); //add last sentences to the end of the output queue.
@@ -1035,7 +974,7 @@ private void executeThread(String buffer) {
LOG.trace("start "+i);
LOG.trace("OutBufferSize: "+sentBufferOut.size());
break;
- } else
+ } else
if (updateThreads.get(i).getState() == Thread.State.TERMINATED) {
sentBufferThreads.set(i, sentBufferThread);
sentBufferOut.add(String.valueOf(i)); //add last sentences to the end of the output queue.
@@ -1045,7 +984,7 @@ private void executeThread(String buffer) {
LOG.trace("OutBufferSize: "+sentBufferOut.size());
break;
}
-
+
i++;
if (i >= updateThreads.size()) {
try {
@@ -1064,18 +1003,25 @@ private void executeThread(String buffer) {
public static void main(String[] args) throws IOException {
final CoNLLRDFUpdater updater;
+ final FintanStreamHandler inStream = new FintanStreamHandler();
+ final FintanStreamHandler outStream = new FintanStreamHandler();
+ final RDFStreamLoader streamLoader = new RDFStreamLoader();
+ final RDFStreamWriter streamWriter = new RDFStreamWriter();
try {
updater = new CoNLLRDFUpdaterFactory().buildFromCLI(args);
- updater.setInputStream(System.in);
- updater.setOutputStream(System.out);
+ streamLoader.setInputStream(System.in);
+ streamLoader.setOutputStream(inStream);
+ updater.setInputStream(inStream);
+ updater.setOutputStream(outStream);
+ streamWriter.setInputStream(outStream);
+ streamWriter.setOutputStream(System.out);
} catch (ParseException e) {
LOG.error(e);
System.exit(1);
return;
}
- long start = System.currentTimeMillis();
- // READ SENTENCES from System.in
- updater.processSentenceStream();
- LOG.debug((System.currentTimeMillis()-start)/1000 + " seconds");
+ new Thread(streamLoader).start();
+ new Thread(updater).start();
+ new Thread(streamWriter).start();
}
}
diff --git a/src/main/java/org/acoli/conll/rdf/CoNLLRDFUpdaterFactory.java b/src/main/java/org/acoli/conll/rdf/CoNLLRDFUpdaterFactory.java
index d579b5a..c0a0766 100644
--- a/src/main/java/org/acoli/conll/rdf/CoNLLRDFUpdaterFactory.java
+++ b/src/main/java/org/acoli/conll/rdf/CoNLLRDFUpdaterFactory.java
@@ -1,208 +1,209 @@
-package org.acoli.conll.rdf;
-
-import static org.acoli.conll.rdf.CoNLLRDFCommandLine.parseUpdate;
-import java.io.IOException;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.node.ObjectNode;
-
-import org.apache.commons.cli.*;
-import org.apache.commons.lang3.tuple.ImmutableTriple;
-import org.apache.commons.lang3.tuple.Pair;
-import org.apache.commons.lang3.tuple.Triple;
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
-
-public class CoNLLRDFUpdaterFactory extends CoNLLRDFComponentFactory {
- static Logger LOG = LogManager.getLogger(CoNLLRDFUpdaterFactory.class);
- @Override
- public CoNLLRDFUpdater buildFromCLI(String[] args) throws IOException, ParseException {
- CoNLLRDFUpdater updater = new CoNLLRDFUpdater();
- final CommandLine cmd = new CoNLLRDFCommandLine(
- "CoNLLRDFUpdater [-loglevel LEVEL] [-threads T] [-lookahead N] [-lookback N] [-custom [-model URI [GRAPH]]* [-graphsout DIR [SENT_ID ...]] [-triplesout DIR [SENT_ID ...]] -updates [UPDATE ...]]",
- "read TTL from stdin => update CoNLL-RDF", new Option[] {
- // Define cli options in the correct order for the help-message
- Option.builder("loglevel").hasArg().desc("set log level to LEVEL").argName("level").build(),
- Option.builder("threads").hasArg()
- .desc("use T threads max\ndefault: half of available logical processor cores")
- .type(Number.class).build(),
- Option.builder("lookahead").hasArg().desc("cache N further sentences in lookahead graph")
- .type(Number.class).build(),
- Option.builder("lookback").hasArg().desc("cache N preceeding sentences in lookback graph")
- .type(Number.class).build(),
- new Option("prefixDeduplication", false, "Remove duplicates of TTL-Prefixes"),
- Option.builder("custom").hasArg(false).desc("use custom update scripts")
- ./* required(). */build(),
- Option.builder("model").hasArgs().desc("to load additional Models into local graph").build(),
- Option.builder("graphsout").hasArgs().desc(
- "output directory for the .dot graph files\nfollowed by the IDs of the sentences to be visualized\ndefault: first sentence only")
- .build(),
- Option.builder("triplesout").hasArgs()
- .desc("same as graphsout but write N-TRIPLES for text debug instead.").build(),
- Option.builder("updates").hasArgs()
- .desc("followed by SPARQL scripts paired with {iterations/u}").build() },
- CoNLLRDFUpdater.LOG).parseArgs(args);
-
- if (cmd.hasOption("threads")) {
- updater.setThreads(((Number) cmd.getParsedOptionValue("threads")).intValue());
- }
- if (cmd.hasOption("lookahead")) {
- updater.activateLookahead(((Number) cmd.getParsedOptionValue("lookahead")).intValue());
- }
- if (cmd.hasOption("lookback")) {
- updater.activateLookback(((Number) cmd.getParsedOptionValue("lookback")).intValue());
- }
- if (cmd.hasOption("prefixDeduplication")) {
- updater.activatePrefixDeduplication();
- }
- // READ GRAPHSOUT PARAMETERS
- if (cmd.hasOption("graphsout")) {
- String[] graphsoutArgs = cmd.getOptionValues("graphsout");
- String outputDir = graphsoutArgs[0];
- List outputSentences = Arrays.asList(Arrays.copyOfRange(graphsoutArgs, 1, graphsoutArgs.length));
- updater.activateGraphsOut(outputDir, outputSentences);
- }
- // READ TRIPLESOUT PARAMETERS
- if (cmd.hasOption("triplesout")) {
- String[] triplesoutArgs = cmd.getOptionValues("triplesout");
- String outputDir = triplesoutArgs[0];
- List outputSentences = Arrays.asList(Arrays.copyOfRange(triplesoutArgs, 1, triplesoutArgs.length));
- updater.activateTriplesOut(outputDir, outputSentences);
- }
-
- if (cmd.hasOption("model")) {
- for (Option opt : cmd.getOptions()) {
- if (opt.getOpt().equals("model")) { // opt.equals(model)
- String[] model = opt.getValues();
- try {
- if (model.length == 1) {
- updater.loadGraph(new URI(model[0]), new URI(model[0]));
- } else if (model.length == 2) {
- updater.loadGraph(new URI(model[0]), new URI(model[1]));
- } else {
- throw new ParseException("Error while loading model: Please provide one or two URIs");
- }
- } catch (URISyntaxException e) {
- throw new ParseException("Error while loading model: Could not parse given arguments as URI");
- }
- }
- }
- }
-
- if (cmd.hasOption("updates")) {
- List> updates = new ArrayList<>();
- for (String arg : Arrays.asList(cmd.getOptionValues("updates"))) {
- Pair parsed = parseUpdate(arg);
- // should be <#UPDATEFILENAMEORSTRING, #UPDATESTRING, #UPDATEITER>
- updates.add(new ImmutableTriple(parsed.getKey(), parsed.getKey(),
- parsed.getValue()));
- }
- updater.parseUpdates(updates);
- }
- return updater;
- }
-
- @Override
- public CoNLLRDFUpdater buildFromJsonConf(ObjectNode conf) throws IOException, ParseException {
- // READ THREAD PARAMETERS
- int threads = 0;
- if (conf.get("threads") != null)
- threads = conf.get("threads").asInt(0);
- CoNLLRDFUpdater updater = new CoNLLRDFUpdater("","",threads);
-
- // READ GRAPHSOUT PARAMETERS
- if (conf.get("graphsoutDIR") != null) {
- String graphOutputDir = conf.get("graphsoutDIR").asText("");
- if (!graphOutputDir.equals("")) {
- List graphOutputSentences = new ArrayList();
- for (JsonNode snt:conf.withArray("graphsoutSNT")) {
- graphOutputSentences.add(snt.asText());
- }
- updater.activateGraphsOut(graphOutputDir, graphOutputSentences);
- }
- }
-
- // READ TRIPLESOUT PARAMETERS
- if (conf.get("triplesoutDIR") != null) {
- String triplesOutputDir = conf.get("triplesoutDIR").asText("");
- if (!triplesOutputDir.equals("")) {
- List triplesOutputSentences = new ArrayList();
- for (JsonNode snt:conf.withArray("triplesoutSNT")) {
- triplesOutputSentences.add(snt.asText());
- }
- updater.activateTriplesOut(triplesOutputDir, triplesOutputSentences);
- }
- }
-
- // READ LOOKAHEAD PARAMETERS
- if (conf.get("lookahead") != null) {
- int lookahead_snts = conf.get("lookahead").asInt(0);
- if (lookahead_snts > 0)
- updater.activateLookahead(lookahead_snts);
- }
-
- // READ LOOKBACK PARAMETERS
- if (conf.get("lookback") != null) {
- int lookback_snts = conf.get("lookback").asInt(0);
- if (lookback_snts > 0)
- updater.activateLookback(lookback_snts);
- }
-
- // READ PREFIX DEDUPLICATION
- if (conf.get("prefixDeduplication") != null) {
- Boolean prefixDeduplication = conf.get("prefixDeduplication").asBoolean();
- if (prefixDeduplication)
- updater.activatePrefixDeduplication();
- }
-
- // READ ALL UPDATES
- // should be <#UPDATEFILENAMEORSTRING, #UPDATESTRING, #UPDATEITER>
- List> updates = new ArrayList>();
- for (JsonNode update:conf.withArray("updates")) {
- String freq = update.get("iter").asText("1");
- if (freq.equals("u"))
- freq = "*";
- try {
- Integer.parseInt(freq);
- } catch (NumberFormatException e) {
- if (!"*".equals(freq))
- throw e;
- }
- String path = update.get("path").asText();
- updates.add(new ImmutableTriple(path, path, freq));
- }
- updater.parseUpdates(updates);
-
- // READ ALL MODELS
- for (JsonNode model:conf.withArray("models")) {
- List models = new ArrayList();
- String uri = model.get("source").asText();
- if (!uri.equals("")) models.add(uri);
- uri = model.get("graph").asText();
- if (!uri.equals("")) models.add(uri);
- if (models.size()==1) {
- try {
- updater.loadGraph(new URI(models.get(0)), new URI(models.get(0)));
- } catch (URISyntaxException e) {
- throw new IOException(e);
- }
- } else if (models.size()==2){
- try {
- updater.loadGraph(new URI(models.get(0)), new URI(models.get(1)));
- } catch (URISyntaxException e) {
- throw new IOException(e);
- }
- } else if (models.size()>2){
- throw new IOException("Error while loading model: Please specify model source URI and graph destination.");
- }
- models.removeAll(models);
- }
-
- return updater;
- }
-}
+package org.acoli.conll.rdf;
+
+import static org.acoli.conll.rdf.CoNLLRDFCommandLine.parseUpdate;
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+
+import org.acoli.fintan.core.FintanStreamComponentFactory;
+import org.apache.commons.cli.*;
+import org.apache.commons.lang3.tuple.ImmutableTriple;
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.commons.lang3.tuple.Triple;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+public class CoNLLRDFUpdaterFactory implements FintanStreamComponentFactory {
+ static Logger LOG = LogManager.getLogger(CoNLLRDFUpdaterFactory.class);
+ @Override
+ public CoNLLRDFUpdater buildFromCLI(String[] args) throws IOException, ParseException {
+ CoNLLRDFUpdater updater = new CoNLLRDFUpdater();
+ final CommandLine cmd = new CoNLLRDFCommandLine(
+ "CoNLLRDFUpdater [-loglevel LEVEL] [-threads T] [-lookahead N] [-lookback N] [-custom [-model URI [GRAPH]]* [-graphsout DIR [SENT_ID ...]] [-triplesout DIR [SENT_ID ...]] -updates [UPDATE ...]]",
+ "read TTL from stdin => update CoNLL-RDF", new Option[] {
+ // Define cli options in the correct order for the help-message
+ Option.builder("loglevel").hasArg().desc("set log level to LEVEL").argName("level").build(),
+ Option.builder("threads").hasArg()
+ .desc("use T threads max\ndefault: half of available logical processor cores")
+ .type(Number.class).build(),
+ Option.builder("lookahead").hasArg().desc("cache N further sentences in lookahead graph")
+ .type(Number.class).build(),
+ Option.builder("lookback").hasArg().desc("cache N preceding sentences in lookback graph")
+ .type(Number.class).build(),
+ new Option("prefixDeduplication", false, "Remove duplicates of TTL-Prefixes"),
+ Option.builder("custom").hasArg(false).desc("use custom update scripts")
+ ./* required(). */build(),
+ Option.builder("model").hasArgs().desc("to load additional Models into local graph").build(),
+ Option.builder("graphsout").hasArgs().desc(
+ "output directory for the .dot graph files\nfollowed by the IDs of the sentences to be visualized\ndefault: first sentence only")
+ .build(),
+ Option.builder("triplesout").hasArgs()
+ .desc("same as graphsout but write N-TRIPLES for text debug instead.").build(),
+ Option.builder("updates").hasArgs()
+ .desc("followed by SPARQL scripts paired with {iterations/u}").build() },
+ CoNLLRDFUpdater.LOG).parseArgs(args);
+
+ if (cmd.hasOption("threads")) {
+ updater.setThreads(((Number) cmd.getParsedOptionValue("threads")).intValue());
+ }
+ if (cmd.hasOption("lookahead")) {
+ updater.activateLookahead(((Number) cmd.getParsedOptionValue("lookahead")).intValue());
+ }
+ if (cmd.hasOption("lookback")) {
+ updater.activateLookback(((Number) cmd.getParsedOptionValue("lookback")).intValue());
+ }
+ if (cmd.hasOption("prefixDeduplication")) {
+ updater.activatePrefixDeduplication();
+ }
+ // READ GRAPHSOUT PARAMETERS
+ if (cmd.hasOption("graphsout")) {
+ String[] graphsoutArgs = cmd.getOptionValues("graphsout");
+ String outputDir = graphsoutArgs[0];
+ List outputSentences = Arrays.asList(Arrays.copyOfRange(graphsoutArgs, 1, graphsoutArgs.length));
+ updater.activateGraphsOut(outputDir, outputSentences);
+ }
+ // READ TRIPLESOUT PARAMETERS
+ if (cmd.hasOption("triplesout")) {
+ String[] triplesoutArgs = cmd.getOptionValues("triplesout");
+ String outputDir = triplesoutArgs[0];
+ List outputSentences = Arrays.asList(Arrays.copyOfRange(triplesoutArgs, 1, triplesoutArgs.length));
+ updater.activateTriplesOut(outputDir, outputSentences);
+ }
+
+ if (cmd.hasOption("model")) {
+ for (Option opt : cmd.getOptions()) {
+ if (opt.getOpt().equals("model")) { // opt.equals(model)
+ String[] model = opt.getValues();
+ try {
+ if (model.length == 1) {
+ updater.loadGraph(new URI(model[0]), new URI(model[0]));
+ } else if (model.length == 2) {
+ updater.loadGraph(new URI(model[0]), new URI(model[1]));
+ } else {
+ throw new ParseException("Error while loading model: Please provide one or two URIs");
+ }
+ } catch (URISyntaxException e) {
+ throw new ParseException("Error while loading model: Could not parse given arguments as URI");
+ }
+ }
+ }
+ }
+
+ if (cmd.hasOption("updates")) {
+ List> updates = new ArrayList<>();
+ for (String arg : Arrays.asList(cmd.getOptionValues("updates"))) {
+ Pair parsed = parseUpdate(arg);
+ // should be <#UPDATEFILENAMEORSTRING, #UPDATESTRING, #UPDATEITER>
+ updates.add(new ImmutableTriple(parsed.getKey(), parsed.getKey(),
+ parsed.getValue()));
+ }
+ updater.parseUpdates(updates);
+ }
+ return updater;
+ }
+
+ @Override
+ public CoNLLRDFUpdater buildFromJsonConf(ObjectNode conf) throws IOException, ParseException {
+ // READ THREAD PARAMETERS
+ int threads = 0;
+ if (conf.get("threads") != null)
+ threads = conf.get("threads").asInt(0);
+ CoNLLRDFUpdater updater = new CoNLLRDFUpdater("","",threads);
+
+ // READ GRAPHSOUT PARAMETERS
+ if (conf.get("graphsoutDIR") != null) {
+ String graphOutputDir = conf.get("graphsoutDIR").asText("");
+ if (!graphOutputDir.equals("")) {
+ List graphOutputSentences = new ArrayList();
+ for (JsonNode snt:conf.withArray("graphsoutSNT")) {
+ graphOutputSentences.add(snt.asText());
+ }
+ updater.activateGraphsOut(graphOutputDir, graphOutputSentences);
+ }
+ }
+
+ // READ TRIPLESOUT PARAMETERS
+ if (conf.get("triplesoutDIR") != null) {
+ String triplesOutputDir = conf.get("triplesoutDIR").asText("");
+ if (!triplesOutputDir.equals("")) {
+ List triplesOutputSentences = new ArrayList();
+ for (JsonNode snt:conf.withArray("triplesoutSNT")) {
+ triplesOutputSentences.add(snt.asText());
+ }
+ updater.activateTriplesOut(triplesOutputDir, triplesOutputSentences);
+ }
+ }
+
+ // READ LOOKAHEAD PARAMETERS
+ if (conf.get("lookahead") != null) {
+ int lookahead_snts = conf.get("lookahead").asInt(0);
+ if (lookahead_snts > 0)
+ updater.activateLookahead(lookahead_snts);
+ }
+
+ // READ LOOKBACK PARAMETERS
+ if (conf.get("lookback") != null) {
+ int lookback_snts = conf.get("lookback").asInt(0);
+ if (lookback_snts > 0)
+ updater.activateLookback(lookback_snts);
+ }
+
+ // READ PREFIX DEDUPLICATION
+ if (conf.get("prefixDeduplication") != null) {
+ Boolean prefixDeduplication = conf.get("prefixDeduplication").asBoolean();
+ if (prefixDeduplication)
+ updater.activatePrefixDeduplication();
+ }
+
+ // READ ALL UPDATES
+ // should be <#UPDATEFILENAMEORSTRING, #UPDATESTRING, #UPDATEITER>
+ List> updates = new ArrayList>();
+ for (JsonNode update:conf.withArray("updates")) {
+ String freq = update.get("iter").asText("1");
+ if (freq.equals("u"))
+ freq = "*";
+ try {
+ Integer.parseInt(freq);
+ } catch (NumberFormatException e) {
+ if (!"*".equals(freq))
+ throw e;
+ }
+ String path = update.get("path").asText();
+ updates.add(new ImmutableTriple(path, path, freq));
+ }
+ updater.parseUpdates(updates);
+
+ // READ ALL MODELS
+ for (JsonNode model:conf.withArray("models")) {
+ List models = new ArrayList();
+ String uri = model.get("source").asText();
+ if (!uri.equals("")) models.add(uri);
+ uri = model.get("graph").asText();
+ if (!uri.equals("")) models.add(uri);
+ if (models.size()==1) {
+ try {
+ updater.loadGraph(new URI(models.get(0)), new URI(models.get(0)));
+ } catch (URISyntaxException e) {
+ throw new IOException(e);
+ }
+ } else if (models.size()==2){
+ try {
+ updater.loadGraph(new URI(models.get(0)), new URI(models.get(1)));
+ } catch (URISyntaxException e) {
+ throw new IOException(e);
+ }
+ } else if (models.size()>2){
+ throw new IOException("Error while loading model: Please specify model source URI and graph destination.");
+ }
+ models.removeAll(models);
+ }
+
+ return updater;
+ }
+}
diff --git a/src/main/java/org/acoli/conll/rdf/CoNLLRDFUtil.java b/src/main/java/org/acoli/conll/rdf/CoNLLRDFUtil.java
new file mode 100644
index 0000000..5efb626
--- /dev/null
+++ b/src/main/java/org/acoli/conll/rdf/CoNLLRDFUtil.java
@@ -0,0 +1,84 @@
+package org.acoli.conll.rdf;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.StringWriter;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.jena.query.QueryExecution;
+import org.apache.jena.query.QueryExecutionFactory;
+import org.apache.jena.query.ResultSet;
+import org.apache.jena.rdf.model.Model;
+
+public class CoNLLRDFUtil {
+
+ /**
+ * FOR LEO: please move wherever you like
+ *
+ * @param model CoNLL-RDF sentence as Model
+ * @return String[0]: all comments + \n String[1]: model as Turtle (unsorted)
+ * concatenate: Full CoNLL-RDF output
+ */
+ public static String conllRdfModel2String(Model model) {
+ final String comments = rdfComments2String(model);
+
+ // generate CoNLL-RDF Turtle (unsorted)
+ StringWriter modelOut = new StringWriter();
+ model.write(modelOut, "TTL");
+ final String modelString = modelOut.toString();
+
+ return comments + modelString;
+ }
+
+ /**
+ * @param model CoNLL-RDF sentence as Model
+ * @return String: all comments + \n
+ */
+ private static String rdfComments2String(Model model) {
+ // generate comments in out[0]
+ String out = new String();
+ String selectComments = "PREFIX nif: \n"
+ + "PREFIX rdfs: