From 8a7544d4d9eaffedd617f19ec56c035e1c4cf29b Mon Sep 17 00:00:00 2001 From: Bart Hanssens Date: Tue, 9 Jul 2024 18:14:54 +0100 Subject: [PATCH] GH-5058: additional parser code (WIP) --- .../eclipse/rdf4j/rio/csvw/CSVWParser.java | 72 ++++++--- .../rdf4j/rio/csvw/parsers/CellParser.java | 88 +++++++---- .../rio/csvw/parsers/CellParserFactory.java | 72 ++------- .../rdf4j/rio/csvw/parsers/Parser.java | 139 ------------------ 4 files changed, 120 insertions(+), 251 deletions(-) delete mode 100644 core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/Parser.java diff --git a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWParser.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWParser.java index f31a8c7c62..decd55d37a 100644 --- a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWParser.java +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWParser.java @@ -17,15 +17,10 @@ import java.io.Reader; import java.net.URI; import java.util.ArrayList; -import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Optional; -import java.util.logging.Level; -import java.util.logging.Logger; import java.util.stream.Collectors; -import java.util.stream.IntStream; -import java.util.stream.Stream; import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.Model; @@ -45,7 +40,8 @@ import org.eclipse.rdf4j.rio.RDFHandlerException; import org.eclipse.rdf4j.rio.RDFParseException; import org.eclipse.rdf4j.rio.Rio; -import org.eclipse.rdf4j.rio.csvw.parsers.Parser; +import org.eclipse.rdf4j.rio.csvw.parsers.CellParserFactory; +import org.eclipse.rdf4j.rio.csvw.parsers.CellParser; import org.eclipse.rdf4j.rio.helpers.AbstractRDFParser; import org.slf4j.LoggerFactory; @@ -90,13 +86,15 @@ public synchronized void parse(InputStream in, String baseURI) if (csvFile == null) { throw new RDFParseException("Could not find URL"); } - metadata.getNamespaces().add(new SimpleNamespace("", csvFile.toString() + "#")); + // add dummy namespace for resolving unspecified column names / predicates relative to CSV file + metadata.getNamespaces().add(new SimpleNamespace("_local", csvFile.toString() + "#")); + Resource tableSchema = getTableSchema(metadata, (Resource) table); List columns = getColumns(metadata, tableSchema); - Parser[] cellParsers = columns.stream() + CellParser[] cellParsers = columns.stream() .map(c -> getCellParser(metadata, (Resource) c)) .collect(Collectors.toList()) - .toArray(new Parser[columns.size()]); + .toArray(new CellParser[columns.size()]); parseCSV(metadata, rdfHandler, csvFile, cellParsers, (Resource) table); } @@ -205,13 +203,16 @@ private List getColumns(Model metadata, Resource tableSchema) throws RDFP } /** + * Get parser for specific column * * @param metadata - * @param table + * @param column * @return */ - private Parser getCellParser(Model metadata, Resource column) { - Parser parser = new Parser(); + private CellParser getCellParser(Model metadata, Resource column) { + IRI datatype = getDatatypeIRI(metadata, column); + + CellParser parser = CellParserFactory.create(datatype); Optional name = Models.getProperty(metadata, column, CSVW.NAME); if (!name.isPresent()) { @@ -224,13 +225,9 @@ private Parser getCellParser(Model metadata, Resource column) { parser.setDefaultValue(defaultVal.get().stringValue()); } - // Optional dataType = Models.getProperty(metadata, column, CSVW.DATATYPE); - // parser.setDataType((IRI) dataType.orElse(XSD.STRING.getIri())); - Optional propertyURL = Models.getProperty(metadata, column, CSVW.PROPERTY_URL); - if (propertyURL.isPresent()) { - parser.setPropertyURL(metadata.getNamespaces(), propertyURL.get().stringValue()); - } + String s = propertyURL.isPresent() ? propertyURL.get().stringValue() : "_local:" + parser.getName(); + parser.setPropertyURL(metadata.getNamespaces(), s); Optional valueURL = Models.getProperty(metadata, column, CSVW.VALUE_URL); if (valueURL.isPresent()) { @@ -239,8 +236,30 @@ private Parser getCellParser(Model metadata, Resource column) { return parser; } - private IRI getDataType(Model metadata, Value col) { - return XSD.STRING.getIri(); + /** + * Get IRI of base or derived datatype + * + * @param metadata + * @param column + * @return + */ + private IRI getDatatypeIRI(Model metadata, Resource column) { + Optional val = Models.getProperty(metadata, column, CSVW.DATATYPE); + if (val.isPresent()) { + Value datatype = val.get(); + // derived datatype + if (datatype.isBNode()) { + val = Models.getProperty(metadata, (Resource) datatype, CSVW.BASE); + } + } + if (!val.isPresent()) { + return XSD.STRING.getIri(); + } + Value datatype = val.get(); + if (datatype.isIRI()) { + return (IRI) datatype; + } + return XSD.valueOf(datatype.stringValue().toUpperCase()).getIri(); } /** @@ -261,7 +280,7 @@ private String getAboutURL(Model metadata, Resource subject) { * @param cellParsers * @return 0-based index or -1 */ - private int getAboutIndex(String aboutURL, Parser[] cellParsers) { + private int getAboutIndex(String aboutURL, CellParser[] cellParsers) { if (aboutURL == null || aboutURL.isEmpty()) { return -1; } @@ -284,7 +303,7 @@ private int getAboutIndex(String aboutURL, Parser[] cellParsers) { * @param aboutURL * @param aboutIndex */ - private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, Parser[] cellParsers, Resource table) { + private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParser[] cellParsers, Resource table) { String aboutURL = getAboutURL(metadata, table); // check for placeholder / column name that's being used to create subject IRI @@ -317,6 +336,13 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, Parser[] } } + /** + * Get configured CSV file reader + * + * @param metadata + * @param reader + * @return + */ private CSVReader getCSVReader(Model metadata, Reader reader) { CSVParser parser = new CSVParserBuilder().build(); return new CSVReaderBuilder(reader).withSkipLines(1).withCSVParser(parser).build(); @@ -329,7 +355,7 @@ private CSVReader getCSVReader(Model metadata, Reader reader) { * @param aboutURL * @param aboutIndex */ - private Resource getIRIorBnode(Parser[] cellParsers, String[] cells, String aboutURL, int aboutIndex, + private Resource getIRIorBnode(CellParser[] cellParsers, String[] cells, String aboutURL, int aboutIndex, String placeholder) { if (aboutIndex > -1) { Value val = cellParsers[aboutIndex].parse(cells[aboutIndex]); diff --git a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParser.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParser.java index 1b481a74a1..a6957f5f1f 100644 --- a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParser.java +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParser.java @@ -10,40 +10,54 @@ *******************************************************************************/ package org.eclipse.rdf4j.rio.csvw.parsers; +import java.util.Set; + +import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.model.Namespace; +import org.eclipse.rdf4j.model.Value; +import org.eclipse.rdf4j.model.util.Literals; +import org.eclipse.rdf4j.model.util.Values; import org.eclipse.rdf4j.rio.RDFParseException; /** * - * @author Bart.Hanssens + * @author Bart Hanssens */ -public class CellParser { - private T minValue; - private T maxValue; - private T defaultValue; +public class CellParser { + private String name; + private IRI dataType; + private String defaultValue; private boolean isRequired; private String format; - private String propertyUrl; + private IRI propertyIRI; private String valueUrl; private String separator; /** - * @param minValue the minValue to set + * @param name + */ + public void setName(String name) { + this.name = name; + } + + /** + * @return name */ - public void setMinValue(T minValue) { - this.minValue = minValue; + public String getName() { + return name; } /** - * @param maxValue the maxValue to set + * @param dataType */ - public void setMaxValue(T maxValue) { - this.maxValue = maxValue; + public void setDataType(IRI dataType) { + this.dataType = dataType; } /** * @param defaultValue the defaultValue to set */ - public void setDefaultValue(T defaultValue) { + public void setDefaultValue(String defaultValue) { this.defaultValue = defaultValue; } @@ -62,30 +76,42 @@ public void setFormat(String format) { } /** - * @return the propertyUrl + * @return the propertyUrl as IRI */ - public String getPropertyUrl() { - return propertyUrl; + public IRI getPropertyIRI() { + return propertyIRI; } /** + * Set property URL (predicate IRI) + * + * @param namespaces set of namespaces * @param propertyUrl the propertyUrl to set */ - public void setPropertyUrl(String propertyUrl) { - this.propertyUrl = propertyUrl; + public void setPropertyURL(Set namespaces, String propertyUrl) { + this.propertyIRI = Values.iri(namespaces, propertyUrl); + } + + /** + * Set property URL (predicate IRI) relative to document + * + * @param propertyUrl the propertyUrl to set + */ + public void setPropertyURL(String propertyUrl) { + this.propertyIRI = Values.iri("", propertyUrl); } /** * @return the valueUrl */ - public String getValueUrl() { + public String getValueURL() { return valueUrl; } /** * @param valueUrl the valueUrl to set */ - public void setValueUrl(String valueUrl) { + public void setValueURL(String valueUrl) { this.valueUrl = valueUrl; } @@ -103,12 +129,22 @@ public void setSeparator(String separator) { this.separator = separator; } - public T parse(Object cell) { - return (T) cell; - } - - public void validate(T value) throws RDFParseException { - + /** + * Get the value from a cell + * + * @param cell + * @return + */ + public Value parse(String cell) { + String s = cell; + if ((s == null || s.isEmpty()) && (defaultValue != null)) { + s = defaultValue; + } + if (valueUrl != null && s != null) { + return Values.iri(valueUrl.replace("{" + name + "}", s)); + } + + return Values.literal(s, dataType); } } diff --git a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserFactory.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserFactory.java index 035f1992a0..141f18a96b 100644 --- a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserFactory.java +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserFactory.java @@ -10,75 +10,21 @@ *******************************************************************************/ package org.eclipse.rdf4j.rio.csvw.parsers; -import static org.eclipse.rdf4j.model.base.CoreDatatype.XSD.STRING; - -import java.util.Optional; - -import org.eclipse.rdf4j.model.BNode; -import org.eclipse.rdf4j.model.Literal; -import org.eclipse.rdf4j.model.Model; -import org.eclipse.rdf4j.model.Resource; -import org.eclipse.rdf4j.model.Value; -import org.eclipse.rdf4j.model.base.CoreDatatype.XSD; -import org.eclipse.rdf4j.model.util.Models; -import org.eclipse.rdf4j.model.vocabulary.CSVW; -import org.eclipse.rdf4j.rio.RDFParseException; +import org.eclipse.rdf4j.model.IRI; /** * - * @author Bart.Hanssens + * @author Bart Hanssens */ public class CellParserFactory { - - private static XSD getDataType(Model model, Resource column) { - Optional val = Models.getProperty(model, column, CSVW.DATATYPE); - if (!val.isPresent()) { - return XSD.STRING; - } - Value v = val.get(); - XSD datatype = null; - - if (v instanceof Literal) { - datatype = XSD.valueOf(v.stringValue().toUpperCase()); - } - if (v instanceof Resource) { - val = Models.getProperty(model, (Resource) v, CSVW.BASE); - if (!val.isPresent()) { - return XSD.STRING; - } - v = val.get(); - datatype = XSD.valueOf(v.stringValue().toUpperCase()); - } - if (datatype == null) { - throw new RDFParseException("Could not parse datatype of column"); - } - return datatype; - } - /** - * Create a CellParser based on the (JSON-LD) metadata of a column - * - * @param Model - * @return + * Create a new CellParser based on datatype + * @param datatype + * @return */ - public static CellParser fromMetadata(Model model, Resource column) { - CellParser parser; - - XSD dataType = getDataType(model, column); - switch (dataType) { - case STRING: - parser = new CellParser(); - break; - case BOOLEAN: - parser = new CellParser(); - break; - case INTEGER: - parser = new CellParser(); - break; - default: - parser = new CellParser(); - break; - } - return parser; + public static CellParser create(IRI datatype) { + CellParser p = new CellParser(); + p.setDataType(datatype); + return p; } } diff --git a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/Parser.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/Parser.java deleted file mode 100644 index 564fdbb024..0000000000 --- a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/Parser.java +++ /dev/null @@ -1,139 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2024 Eclipse RDF4J contributors. - * - * All rights reserved. This program and the accompanying materials - * are made available under the terms of the Eclipse Distribution License v1.0 - * which accompanies this distribution, and is available at - * http://www.eclipse.org/org/documents/edl-v10.php. - * - * SPDX-License-Identifier: BSD-3-Clause - *******************************************************************************/ -package org.eclipse.rdf4j.rio.csvw.parsers; - -import java.util.Set; - -import org.eclipse.rdf4j.model.IRI; -import org.eclipse.rdf4j.model.Namespace; -import org.eclipse.rdf4j.model.Value; -import org.eclipse.rdf4j.model.util.Literals; -import org.eclipse.rdf4j.model.util.Values; -import org.eclipse.rdf4j.rio.RDFParseException; - -/** - * - * @author Bart.Hanssens - */ -public class Parser { - private String name; - private IRI dataType; - private String defaultValue; - private boolean isRequired; - private String format; - private IRI propertyIRI; - private String valueUrl; - private String separator; - - /** - * @param name - */ - public void setName(String name) { - this.name = name; - } - - /** - * @return name - */ - public String getName() { - return name; - } - - /** - * @param dataType - */ - public void setDataType(IRI dataType) { - this.dataType = dataType; - } - - /** - * @param defaultValue the defaultValue to set - */ - public void setDefaultValue(String defaultValue) { - this.defaultValue = defaultValue; - } - - /** - * @param isRequired the isRequired to set - */ - public void setIsRequired(boolean isRequired) { - this.isRequired = isRequired; - } - - /** - * @param format the format to set - */ - public void setFormat(String format) { - this.format = format; - } - - /** - * @return the propertyUrl as IRI - */ - public IRI getPropertyIRI() { - return propertyIRI; - } - - /** - * @param namespaces set of namespaces - * @param propertyUrl the propertyUrl to set - */ - public void setPropertyURL(Set namespaces, String propertyUrl) { - this.propertyIRI = Values.iri(namespaces, propertyUrl); - } - - /** - * @return the valueUrl - */ - public String getValueURL() { - return valueUrl; - } - - /** - * @param valueUrl the valueUrl to set - */ - public void setValueURL(String valueUrl) { - this.valueUrl = valueUrl; - } - - /** - * @return the separator - */ - public String getSeparator() { - return separator; - } - - /** - * @param separator the separator to set - */ - public void setSeparator(String separator) { - this.separator = separator; - } - - /** - * Get the value from a cell - * - * @param cell - * @return - */ - public Value parse(String cell) { - String s = cell; - if ((s == null || s.isEmpty()) && (defaultValue != null)) { - s = defaultValue; - } - if (valueUrl != null && s != null) { - return Values.iri(valueUrl.replace("{" + name + "}", s)); - } - - return Values.literal(s, dataType); - } - -}