From 662ef40c69d013756631047d5f16e0fdec0c173b Mon Sep 17 00:00:00 2001
From: Richard Palmer
Date: Sat, 4 Sep 2021 17:32:24 +0100
Subject: [PATCH 1/4] Add fact checking rules

Add the dateEarliest, dateLatest, dateEqual, kBase, entityAbsenceCheck and
factCheck rule properties and register the date, entity absence and fact
checkers in SchemaUtils.

Make the min/max inclusive/exclusive checkers and the lessThanOrEquals
checker read their own property instead of minInclusive resp. lessThan.

---
 .../api/configuration/schema/Rule.java        | 84 +++++++++++++++++++
 .../metadataqa/api/schema/BaseSchema.java     |  2 +
 .../metadataqa/api/schema/SchemaUtils.java    | 38 +++++++--
 3 files changed, 119 insertions(+), 5 deletions(-)

diff --git a/src/main/java/de/gwdg/metadataqa/api/configuration/schema/Rule.java b/src/main/java/de/gwdg/metadataqa/api/configuration/schema/Rule.java
index ee3cfb65..e64433fe 100644
--- a/src/main/java/de/gwdg/metadataqa/api/configuration/schema/Rule.java
+++ b/src/main/java/de/gwdg/metadataqa/api/configuration/schema/Rule.java
@@ -24,6 +24,9 @@ public class Rule implements Serializable {
   private Double minInclusive;
   private Double maxExclusive;
   private Double maxInclusive;
+  private String dateEarliest;
+  private String dateLatest;
+  private String dateEqual;
   private Integer minLength;
   private Integer maxLength;
   private String lessThan;
@@ -31,6 +34,9 @@ public class Rule implements Serializable {
   private String hasValue;
   private Boolean unique;
   private List<String> contentType;
+  private String kBase; // Knowledge Base
+  private String entityAbsenceCheck;
+  private String factCheck;
 
   public String getId() {
     return id;
@@ -237,6 +243,45 @@ public Rule withMaxInclusive(Integer maxInclusive) {
     return this;
   }
 
+  public String getDateEarliest() {
+    return dateEarliest;
+  }
+
+  public void setDateEarliest(String dateEarliest) {
+    this.dateEarliest = dateEarliest;
+  }
+
+  public Rule withDateEarliest(String dateEarliest) {
+    setDateEarliest(dateEarliest);
+    return this;
+  }
+
+  public String getDateLatest() {
+    return dateLatest;
+  }
+
+  public void setDateLatest(String dateLatest) {
+    this.dateLatest = dateLatest;
+  }
+
+  public Rule withDateLatest(String dateLatest) {
+    setDateLatest(dateLatest);
+    return this;
+  }
+
+  public String getDateEqual() {
+    return dateEqual;
+  }
+
+  public void setDateEqual(String dateEqual) {
+    this.dateEqual = dateEqual;
+  }
+
+  public Rule withDateEqual(String dateEqual) {
+    setDateEqual(dateEqual);
+    return this;
+  }
+
   public Integer getMinLength() {
     return minLength;
   }
@@ -361,4 +406,43 @@ public Rule withContentType(List<String> contentType) {
     this.contentType = contentType;
     return this;
   }
+
+  public String getEntityAbsenceCheck() {
+    return entityAbsenceCheck;
+  }
+
+  public void setEntityAbsenceCheck(String entityAbsenceCheck) {
+    this.entityAbsenceCheck = entityAbsenceCheck;
+  }
+
+  public Rule withEntityAbsenceCheck(String entityAbsenceCheck) {
+    setEntityAbsenceCheck(entityAbsenceCheck);
+    return this;
+  }
+
+  public String getFactCheck() {
+    return factCheck;
+  }
+
+  public void setFactCheck(String factCheck) {
+    this.factCheck = factCheck;
+  }
+
+  public Rule withFactCheck(String factCheck) {
+    setFactCheck(factCheck);
+    return this;
+  }
+
+  public String getkBase() {
+    return kBase;
+  }
+
+  public void setkBase(String kBase) {
+    this.kBase = kBase;
+  }
+
+  public Rule withkBase(String kBase) {
+    setkBase(kBase);
+    return this;
+  }
 }
diff --git a/src/main/java/de/gwdg/metadataqa/api/schema/BaseSchema.java b/src/main/java/de/gwdg/metadataqa/api/schema/BaseSchema.java
index 6263e427..3e16893b 100644
--- a/src/main/java/de/gwdg/metadataqa/api/schema/BaseSchema.java
+++ b/src/main/java/de/gwdg/metadataqa/api/schema/BaseSchema.java
@@ -27,6 +27,8 @@ public class BaseSchema implements Schema, CsvAwareSchema, Serializable {
   private Format format;
   private Map<String, String> namespaces;
 
+  // Knowledgebases
+
   public BaseSchema() {
     // initialize without parameters
   }
diff --git a/src/main/java/de/gwdg/metadataqa/api/schema/SchemaUtils.java b/src/main/java/de/gwdg/metadataqa/api/schema/SchemaUtils.java
index ac4a976a..5effcb38 100644
--- a/src/main/java/de/gwdg/metadataqa/api/schema/SchemaUtils.java
+++ b/src/main/java/de/gwdg/metadataqa/api/schema/SchemaUtils.java
@@ -7,6 +7,8 @@
 import de.gwdg.metadataqa.api.rule.logical.OrChecker;
 import de.gwdg.metadataqa.api.rule.pairchecker.DisjointChecker;
 import de.gwdg.metadataqa.api.rule.pairchecker.LessThanPairChecker;
+import de.gwdg.metadataqa.api.rule.pairchecker.EntityAbsenceChecker;
+import de.gwdg.metadataqa.api.rule.pairchecker.FactChecker;
 import de.gwdg.metadataqa.api.rule.singlefieldchecker.ContentTypeChecker;
 import de.gwdg.metadataqa.api.rule.singlefieldchecker.EnumerationChecker;
 import de.gwdg.metadataqa.api.rule.pairchecker.EqualityChecker;
@@ -16,6 +18,7 @@
 import de.gwdg.metadataqa.api.rule.singlefieldchecker.MinCountChecker;
 import de.gwdg.metadataqa.api.rule.singlefieldchecker.MinLengthChecker;
 import de.gwdg.metadataqa.api.rule.singlefieldchecker.NumericValueChecker;
+import de.gwdg.metadataqa.api.rule.singlefieldchecker.DateValueChecker;
 import de.gwdg.metadataqa.api.rule.singlefieldchecker.PatternChecker;
 import de.gwdg.metadataqa.api.rule.RuleChecker;
 import org.apache.commons.lang3.StringUtils;
@@ -29,6 +32,10 @@
 import static de.gwdg.metadataqa.api.rule.singlefieldchecker.NumericValueChecker.TYPE.MIN_EXCLUSIVE;
 import static de.gwdg.metadataqa.api.rule.singlefieldchecker.NumericValueChecker.TYPE.MIN_INCLUSIVE;
 
+import static de.gwdg.metadataqa.api.rule.singlefieldchecker.DateValueChecker.TYPE.DATE_EARLIEST;
+import static de.gwdg.metadataqa.api.rule.singlefieldchecker.DateValueChecker.TYPE.DATE_LATEST;
+import static de.gwdg.metadataqa.api.rule.singlefieldchecker.DateValueChecker.TYPE.DATE_EQUAL;
+
 public class SchemaUtils {
 
   private static final Logger LOGGER = Logger.getLogger(SchemaUtils.class.getCanonicalName());
@@ -91,13 +98,22 @@ private static List<RuleChecker> processRule(Schema schema, JsonBranch branch, R
       ruleCheckers.add(new NumericValueChecker(branch, rule.getMinInclusive(), MIN_INCLUSIVE));
 
     if (rule.getMaxInclusive() != null)
-      ruleCheckers.add(new NumericValueChecker(branch, rule.getMinInclusive(), MAX_INCLUSIVE));
+      ruleCheckers.add(new NumericValueChecker(branch, rule.getMaxInclusive(), MAX_INCLUSIVE));
 
     if (rule.getMinExclusive() != null)
-      ruleCheckers.add(new NumericValueChecker(branch, rule.getMinInclusive(), MIN_EXCLUSIVE));
+      ruleCheckers.add(new NumericValueChecker(branch, rule.getMinExclusive(), MIN_EXCLUSIVE));
 
     if (rule.getMaxExclusive() != null)
-      ruleCheckers.add(new NumericValueChecker(branch, rule.getMinInclusive(), MAX_EXCLUSIVE));
+      ruleCheckers.add(new NumericValueChecker(branch, rule.getMaxExclusive(), MAX_EXCLUSIVE));
+
+    if (rule.getDateEarliest() != null)
+      ruleCheckers.add(new DateValueChecker(branch, rule.getDateEarliest(), DATE_EARLIEST));
+
+    if (rule.getDateLatest() != null)
+      ruleCheckers.add(new DateValueChecker(branch, rule.getDateLatest(), DATE_LATEST));
+
+    if (rule.getDateEqual() != null)
+      ruleCheckers.add(new DateValueChecker(branch, rule.getDateEqual(), DATE_EQUAL));
 
     if (rule.getContentType() != null && !rule.getContentType().isEmpty())
       ruleCheckers.add(new ContentTypeChecker(branch, rule.getContentType()));
@@ -108,6 +124,14 @@ private static List<RuleChecker> processRule(Schema schema, JsonBranch branch, R
     if (rule.getLessThanOrEquals() != null)
-      pair(schema, ruleCheckers, branch, rule.getLessThan(), "lessThanOrEquals");
+      pair(schema, ruleCheckers, branch, rule.getLessThanOrEquals(), "lessThanOrEquals");
 
+    if (rule.getEntityAbsenceCheck() != null) {
+      pair(schema, ruleCheckers, branch, rule.getEntityAbsenceCheck(), "EntityAbsenceChecker");
+    }
+
+    if (rule.getFactCheck() != null) {
+      pair(schema, ruleCheckers, branch, rule.getFactCheck(), "FactChecker");
+    }
+
     if (rule.getAnd() != null) {
       List<RuleChecker> childRuleCheckers = getChildRuleCheckers(schema, branch, rule.getAnd());
       ruleCheckers.add(new AndChecker(branch, childRuleCheckers));
@@ -160,7 +184,11 @@ private static void pair(Schema schema,
       ruleChecker = new LessThanPairChecker(branch, field2, LessThanPairChecker.TYPE.LESS_THAN);
     } else if ("lessThanOrEquals".equals(type)) {
       ruleChecker = new LessThanPairChecker(branch, field2, LessThanPairChecker.TYPE.LESS_THAN_OR_EQUALS);
-    }
+    } else if ("EntityAbsenceChecker".equals(type)) {
+      ruleChecker = new EntityAbsenceChecker(branch, field2);
+    } else if ("FactChecker".equals(type)) {
+      ruleChecker = new FactChecker(branch, field2);
+    }
 
     if (ruleChecker != null)
       ruleCheckers.add(ruleChecker);
From 12d16c283151687807fd2412b9f71d256c7961c5 Mon Sep 17 00:00:00 2001
From: Richard Palmer
Date: Sat, 4 Sep 2021 17:32:48 +0100
Subject: [PATCH 2/4] Update gitignore

---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 9cde48ce..d902a228 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 target/
-/nbproject/
\ No newline at end of file
+/nbproject/
+*~
From b3d82ed6f0c90ca3a55379201428629a4213af40 Mon Sep 17 00:00:00 2001
From: Richard Palmer
Date: Sun, 12 Sep 2021 15:29:38 +0100
Subject: [PATCH 3/4] Detect absent entities

Extract the nouns of a text field with an external tagging service and
report those that are part of the knowledge base but are missing from the
field that should list them. Texts found in the cache are checked the same
way as freshly tagged ones.

---
 .../pairchecker/EntityAbsenceChecker.java     | 199 ++++++++++++++++++
 1 file changed, 199 insertions(+)
 create mode 100644 src/main/java/de/gwdg/metadataqa/api/rule/pairchecker/EntityAbsenceChecker.java

diff --git a/src/main/java/de/gwdg/metadataqa/api/rule/pairchecker/EntityAbsenceChecker.java b/src/main/java/de/gwdg/metadataqa/api/rule/pairchecker/EntityAbsenceChecker.java
new file mode 100644
index 00000000..57d678f9
--- /dev/null
+++ b/src/main/java/de/gwdg/metadataqa/api/rule/pairchecker/EntityAbsenceChecker.java
@@ -0,0 +1,199 @@
+package de.gwdg.metadataqa.api.rule.pairchecker;
+
+import de.gwdg.metadataqa.api.counter.FieldCounter;
+import de.gwdg.metadataqa.api.json.JsonBranch;
+import de.gwdg.metadataqa.api.model.XmlFieldInstance;
+import de.gwdg.metadataqa.api.model.pathcache.PathCache;
+import de.gwdg.metadataqa.api.rule.RuleCheckerOutput;
+import de.gwdg.metadataqa.api.rule.RuleCheckingOutputType;
+
+import com.opencsv.CSVReader;
+import net.minidev.json.JSONArray;
+import net.minidev.json.JSONObject;
+import net.minidev.json.JSONValue;
+
+import java.io.FileReader;
+import java.io.IOException;
+import java.net.URI;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
+import java.net.http.HttpRequest.BodyPublishers;
+import java.net.http.HttpResponse;
+import java.net.http.HttpResponse.BodyHandlers;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+/**
+ * Reports entities that are mentioned in a text field but are missing from the
+ * field that is expected to list them.
+ *
+ * <p>The nouns of every value of {@code field2} are extracted by an external
+ * tagging service. Each noun that is part of the knowledge base but is not among
+ * the values of {@code field1} counts as a missing entity (the comparison is case
+ * insensitive). The rule fails if there is at least one missing entity.
+ */
+public class EntityAbsenceChecker extends PropertyPairChecker {
+
+  private static final long serialVersionUID = -5363342097255677979L;
+  private static final Logger LOGGER = Logger.getLogger(EntityAbsenceChecker.class.getCanonicalName());
+
+  public static final String PREFIX = "entityabsence";
+
+  /** CSV file of the knowledge base; only its first column is read. */
+  private static final String KB_PATH = "/tmp/places.csv";
+  /** Endpoint of the (spaCy based) service that tags the words of a text. */
+  private static final String NER_ENDPOINT = "http://127.0.0.1:8280/dep";
+  /** For the moment only the words having this tag are taken into account. */
+  private static final String NOUN_TAG = "NN";
+
+  /** Lower-cased knowledge base entries, loaded when the class is initialized. */
+  private static final Set<String> KB_WORDS = loadKnowledgeBase(KB_PATH);
+  /** Nouns per analysed text, so the service is asked once per distinct value. */
+  private static final Map<String, List<String>> NER_CACHE = new ConcurrentHashMap<>();
+
+  protected String fixedValue;
+  /** Not serializable, therefore transient and created on first use. */
+  private transient HttpClient client;
+
+  public EntityAbsenceChecker(JsonBranch field1, JsonBranch field2) {
+    super(field1, field2, PREFIX);
+  }
+
+  /**
+   * Counts the knowledge base entities mentioned in field2 but absent from field1.
+   *
+   * @param cache the values of the current record
+   * @param results receives FAILED with the number of missing entities, or PASSED with 0
+   */
+  @Override
+  public void update(PathCache cache, FieldCounter<RuleCheckerOutput> results) {
+    int missingEntities = 0;
+
+    List<XmlFieldInstance> sources = cache.get(field2.getAbsoluteJsonPath());
+    List<XmlFieldInstance> knowns = cache.get(field1.getAbsoluteJsonPath());
+
+    if (sources != null && !sources.isEmpty()) {
+      Set<String> knownEntities = collectKnownEntities(knowns);
+      for (XmlFieldInstance source : sources) {
+        if (!source.hasValue())
+          continue;
+        // cached and freshly extracted nouns go through the same check
+        for (String noun : extractNouns(source.getValue())) {
+          if (KB_WORDS.contains(noun) && !knownEntities.contains(noun))
+            missingEntities++;
+        }
+      }
+    }
+
+    // TODO: find a way to pass the details of the failed rule to the output
+    if (missingEntities > 0)
+      results.put(getHeader(), new RuleCheckerOutput(RuleCheckingOutputType.FAILED, missingEntities));
+    else
+      results.put(getHeader(), new RuleCheckerOutput(RuleCheckingOutputType.PASSED, 0));
+  }
+
+  /**
+   * Reads the first column of the CSV file. A failure is logged and the entries
+   * read so far are kept, so a missing file results in an empty set.
+   */
+  private static Set<String> loadKnowledgeBase(String path) {
+    Set<String> words = new HashSet<>();
+    try (CSVReader csvReader = new CSVReader(new FileReader(path, StandardCharsets.UTF_8))) {
+      String[] values;
+      while ((values = csvReader.readNext()) != null) {
+        if (values.length > 0 && !values[0].isBlank())
+          words.add(values[0].toLowerCase(Locale.ROOT));
+      }
+    } catch (IOException e) {
+      LOGGER.log(Level.WARNING, "Unable to read the knowledge base " + path, e);
+    } catch (Exception e) {
+      // Broad on purpose: catching opencsv's CsvValidationException by name
+      // led to an unknown class error at runtime.
+      LOGGER.log(Level.WARNING, "Unable to parse the knowledge base " + path, e);
+    }
+    return words;
+  }
+
+  /** Collects the lower-cased values of the field that lists the known entities. */
+  private static Set<String> collectKnownEntities(List<XmlFieldInstance> knowns) {
+    Set<String> entities = new HashSet<>();
+    if (knowns != null) {
+      for (XmlFieldInstance known : knowns) {
+        if (known.hasValue())
+          entities.add(known.getValue().toLowerCase(Locale.ROOT));
+      }
+    }
+    return entities;
+  }
+
+  /**
+   * Returns the lower-cased nouns of the text, from the cache if possible.
+   * A failed request is not cached, so it is repeated at the next occurrence.
+   */
+  private List<String> extractNouns(String text) {
+    List<String> cached = NER_CACHE.get(text);
+    if (cached != null)
+      return cached;
+
+    Optional<List<String>> nouns = requestNouns(text);
+    nouns.ifPresent(found -> NER_CACHE.put(text, found));
+    return nouns.orElse(List.of());
+  }
+
+  /** Asks the tagging service for the nouns of the text; empty if that fails. */
+  private Optional<List<String>> requestNouns(String text) {
+    String payload = new JSONObject()
+      .appendField("text", text)
+      .appendField("model", "en")
+      .toJSONString();
+    HttpRequest request = HttpRequest.newBuilder(URI.create(NER_ENDPOINT))
+      .header("accept", "application/json")
+      .POST(BodyPublishers.ofString(payload))
+      .build();
+
+    HttpResponse<String> response;
+    try {
+      response = getClient().send(request, BodyHandlers.ofString());
+    } catch (IOException e) {
+      LOGGER.log(Level.WARNING, "Unable to reach the NER service at " + NER_ENDPOINT, e);
+      return Optional.empty();
+    } catch (InterruptedException e) {
+      Thread.currentThread().interrupt(); // keep the interruption visible to the caller
+      LOGGER.log(Level.WARNING, "The NER request has been interrupted", e);
+      return Optional.empty();
+    }
+
+    Object body = JSONValue.parse(response.body());
+    Object words = (body instanceof JSONObject) ? ((JSONObject) body).get("words") : null;
+    if (!(words instanceof JSONArray)) {
+      LOGGER.warning("Unexpected NER response (HTTP " + response.statusCode() + ")");
+      return Optional.empty();
+    }
+
+    List<String> nouns = new ArrayList<>();
+    for (Object word : (JSONArray) words) {
+      if (!(word instanceof JSONObject))
+        continue;
+      JSONObject wordJson = (JSONObject) word;
+      Object wordText = wordJson.get("text");
+      if (NOUN_TAG.equals(wordJson.get("tag")) && wordText instanceof String)
+        nouns.add(((String) wordText).toLowerCase(Locale.ROOT));
+    }
+    return Optional.of(nouns);
+  }
+
+  private HttpClient getClient() {
+    if (client == null)
+      client = HttpClient.newHttpClient();
+    return client;
+  }
+}
From de77be86b842b1bc542c139ec539e602abe21d0f Mon Sep 17 00:00:00 2001
From: Richard Palmer
Date: Sun, 12 Sep 2021 15:29:55 +0100
Subject: [PATCH 4/4] Fact-checking

Compare the years found in the date field with the validity period the
knowledge base gives for the values of the checked field. Both the
earliest and the latest year of the record are taken into account.

---
 .../api/rule/pairchecker/FactChecker.java     | 153 ++++++++++++++++++
 1 file changed, 153 insertions(+)
 create mode 100644 src/main/java/de/gwdg/metadataqa/api/rule/pairchecker/FactChecker.java

diff --git a/src/main/java/de/gwdg/metadataqa/api/rule/pairchecker/FactChecker.java b/src/main/java/de/gwdg/metadataqa/api/rule/pairchecker/FactChecker.java
new file mode 100644
index 00000000..24600d8c
--- /dev/null
+++ b/src/main/java/de/gwdg/metadataqa/api/rule/pairchecker/FactChecker.java
@@ -0,0 +1,153 @@
+package de.gwdg.metadataqa.api.rule.pairchecker;
+
+import de.gwdg.metadataqa.api.counter.FieldCounter;
+import de.gwdg.metadataqa.api.json.JsonBranch;
+import de.gwdg.metadataqa.api.model.XmlFieldInstance;
+import de.gwdg.metadataqa.api.model.pathcache.PathCache;
+import de.gwdg.metadataqa.api.rule.RuleCheckerOutput;
+
+import com.opencsv.CSVReader;
+
+import java.io.FileReader;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.OptionalInt;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+/**
+ * Checks the values of {@code field1} against the years found in {@code field2}
+ * with the help of a knowledge base of facts.
+ *
+ * <p>The knowledge base gives for a name (a material or technique) the earliest
+ * and optionally the latest year it is valid for. The rule fails if a known name
+ * occurs in a record dated before its earliest year or after its latest year.
+ */
+public class FactChecker extends PropertyPairChecker {
+
+  private static final long serialVersionUID = -5363342097255677979L;
+  private static final Logger LOGGER = Logger.getLogger(FactChecker.class.getCanonicalName());
+
+  public static final String PREFIX = "factcheck";
+
+  /** CSV file of the knowledge base: name, earliest year, latest year (optional). */
+  private static final String KB_PATH = "/tmp/techniques-kb.csv";
+  /** Latest year used when the knowledge base does not give one. */
+  private static final int OPEN_ENDED = 9999;
+
+  /** Lower-cased name to {earliest year, latest year}, loaded when the class is initialized. */
+  private static final Map<String, int[]> KB_FACTS = loadKnowledgeBase(KB_PATH);
+
+  protected String fixedValue;
+
+  public FactChecker(JsonBranch field1, JsonBranch field2) {
+    super(field1, field2, PREFIX);
+  }
+
+  /**
+   * Compares the validity period of every known value of field1 with the
+   * earliest and latest year found in field2.
+   *
+   * @param cache the values of the current record
+   * @param results receives the output built from the isNA and allPassed flags
+   */
+  @Override
+  public void update(PathCache cache, FieldCounter<RuleCheckerOutput> results) {
+    var allPassed = true;
+    var isNA = true;
+
+    List<XmlFieldInstance> materials = cache.get(field1.getAbsoluteJsonPath());
+    List<XmlFieldInstance> dates = cache.get(field2.getAbsoluteJsonPath());
+
+    if (dates != null && !dates.isEmpty()) {
+      // To avoid date manipulation, only the years are compared, as integers
+      int earliestProduction = Integer.MAX_VALUE;
+      int latestProduction = Integer.MIN_VALUE;
+      for (XmlFieldInstance date : dates) {
+        if (!date.hasValue())
+          continue;
+        OptionalInt year = extractYear(date.getValue());
+        if (year.isPresent()) {
+          earliestProduction = Math.min(earliestProduction, year.getAsInt());
+          latestProduction = Math.max(latestProduction, year.getAsInt());
+        }
+      }
+      boolean hasYear = earliestProduction <= latestProduction;
+
+      if (materials != null) {
+        for (XmlFieldInstance material : materials) {
+          if (!material.hasValue())
+            continue;
+          isNA = false;
+
+          int[] validYears = KB_FACTS.get(material.getValue().toLowerCase(Locale.ROOT));
+          if (validYears == null || !hasYear)
+            continue;
+          if (validYears[0] > earliestProduction || validYears[1] < latestProduction) {
+            LOGGER.fine(String.format("'%s' is valid for %d..%d, but the record is dated %d..%d",
+              material.getValue(), validYears[0], validYears[1], earliestProduction, latestProduction));
+            allPassed = false;
+          }
+        }
+      }
+    }
+
+    results.put(getHeader(), new RuleCheckerOutput(this, isNA, allPassed));
+  }
+
+  /**
+   * Reads the knowledge base. Rows with fewer than two columns or a non-numeric
+   * year are skipped; any other failure is logged and the rows read so far are kept.
+   */
+  private static Map<String, int[]> loadKnowledgeBase(String path) {
+    Map<String, int[]> facts = new LinkedHashMap<>();
+    try (CSVReader csvReader = new CSVReader(new FileReader(path, StandardCharsets.UTF_8))) {
+      String[] values;
+      while ((values = csvReader.readNext()) != null) {
+        if (values.length < 2)
+          continue;
+        try {
+          int earliest = Integer.parseInt(values[1].trim());
+          int latest = (values.length > 2 && !values[2].isBlank())
+            ? Integer.parseInt(values[2].trim())
+            : OPEN_ENDED;
+          facts.put(values[0].toLowerCase(Locale.ROOT), new int[]{earliest, latest});
+        } catch (NumberFormatException e) {
+          // e.g. a header row: skip it instead of giving up on the whole file
+          LOGGER.warning("Skipping knowledge base row with an invalid year: " + String.join(",", values));
+        }
+      }
+    } catch (IOException e) {
+      LOGGER.log(Level.WARNING, "Unable to read the knowledge base " + path, e);
+    } catch (Exception e) {
+      // Broad on purpose: catching opencsv's CsvValidationException by name
+      // led to an unknown class error at runtime.
+      LOGGER.log(Level.WARNING, "Unable to parse the knowledge base " + path, e);
+    }
+    return facts;
+  }
+
+  /**
+   * Extracts the leading year of a date such as "1850", "1850-03-01" or, for
+   * a negative year, "-0500-01-01".
+   *
+   * @return the year, or empty if the value does not start with a number
+   */
+  private static OptionalInt extractYear(String date) {
+    String value = date.trim();
+    boolean negative = value.startsWith("-");
+    String unsigned = negative ? value.substring(1) : value;
+    int separator = unsigned.indexOf('-');
+    String digits = separator < 0 ? unsigned : unsigned.substring(0, separator);
+    try {
+      int year = Integer.parseInt(digits);
+      return OptionalInt.of(negative ? -year : year);
+    } catch (NumberFormatException e) {
+      return OptionalInt.empty();
+    }
+  }
+}