Skip to content

Commit

Permalink
eclipse-rdf4jGH-5058: additional parser code (WIP)
Browse files Browse the repository at this point in the history
  • Loading branch information
barthanssens committed Jul 9, 2024
1 parent 421273c commit 8a7544d
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 251 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,10 @@
import java.io.Reader;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Model;
Expand All @@ -45,7 +40,8 @@
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.RDFParseException;
import org.eclipse.rdf4j.rio.Rio;
import org.eclipse.rdf4j.rio.csvw.parsers.Parser;
import org.eclipse.rdf4j.rio.csvw.parsers.CellParserFactory;
import org.eclipse.rdf4j.rio.csvw.parsers.CellParser;
import org.eclipse.rdf4j.rio.helpers.AbstractRDFParser;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -90,13 +86,15 @@ public synchronized void parse(InputStream in, String baseURI)
if (csvFile == null) {
throw new RDFParseException("Could not find URL");
}
metadata.getNamespaces().add(new SimpleNamespace("", csvFile.toString() + "#"));
// add dummy namespace for resolving unspecified column names / predicates relative to CSV file
metadata.getNamespaces().add(new SimpleNamespace("_local", csvFile.toString() + "#"));

Resource tableSchema = getTableSchema(metadata, (Resource) table);
List<Value> columns = getColumns(metadata, tableSchema);
Parser[] cellParsers = columns.stream()
CellParser[] cellParsers = columns.stream()
.map(c -> getCellParser(metadata, (Resource) c))
.collect(Collectors.toList())
.toArray(new Parser[columns.size()]);
.toArray(new CellParser[columns.size()]);

parseCSV(metadata, rdfHandler, csvFile, cellParsers, (Resource) table);
}
Expand Down Expand Up @@ -205,13 +203,16 @@ private List<Value> getColumns(Model metadata, Resource tableSchema) throws RDFP
}

/**
* Get parser for specific column
*
* @param metadata
* @param table
* @param column
* @return
*/
private Parser getCellParser(Model metadata, Resource column) {
Parser parser = new Parser();
private CellParser getCellParser(Model metadata, Resource column) {
IRI datatype = getDatatypeIRI(metadata, column);

CellParser parser = CellParserFactory.create(datatype);

Optional<Value> name = Models.getProperty(metadata, column, CSVW.NAME);
if (!name.isPresent()) {
Expand All @@ -224,13 +225,9 @@ private Parser getCellParser(Model metadata, Resource column) {
parser.setDefaultValue(defaultVal.get().stringValue());
}

// Optional<Value> dataType = Models.getProperty(metadata, column, CSVW.DATATYPE);
// parser.setDataType((IRI) dataType.orElse(XSD.STRING.getIri()));

Optional<Value> propertyURL = Models.getProperty(metadata, column, CSVW.PROPERTY_URL);
if (propertyURL.isPresent()) {
parser.setPropertyURL(metadata.getNamespaces(), propertyURL.get().stringValue());
}
String s = propertyURL.isPresent() ? propertyURL.get().stringValue() : "_local:" + parser.getName();
parser.setPropertyURL(metadata.getNamespaces(), s);

Optional<Value> valueURL = Models.getProperty(metadata, column, CSVW.VALUE_URL);
if (valueURL.isPresent()) {
Expand All @@ -239,8 +236,30 @@ private Parser getCellParser(Model metadata, Resource column) {
return parser;
}

private IRI getDataType(Model metadata, Value col) {
return XSD.STRING.getIri();
/**
* Get IRI of base or derived datatype
*
* @param metadata
* @param column
* @return
*/
private IRI getDatatypeIRI(Model metadata, Resource column) {
Optional<Value> val = Models.getProperty(metadata, column, CSVW.DATATYPE);
if (val.isPresent()) {
Value datatype = val.get();
// derived datatype
if (datatype.isBNode()) {
val = Models.getProperty(metadata, (Resource) datatype, CSVW.BASE);
}
}
if (!val.isPresent()) {
return XSD.STRING.getIri();
}
Value datatype = val.get();
if (datatype.isIRI()) {
return (IRI) datatype;
}
return XSD.valueOf(datatype.stringValue().toUpperCase()).getIri();
}

/**
Expand All @@ -261,7 +280,7 @@ private String getAboutURL(Model metadata, Resource subject) {
* @param cellParsers
* @return 0-based index or -1
*/
private int getAboutIndex(String aboutURL, Parser[] cellParsers) {
private int getAboutIndex(String aboutURL, CellParser[] cellParsers) {
if (aboutURL == null || aboutURL.isEmpty()) {
return -1;
}
Expand All @@ -284,7 +303,7 @@ private int getAboutIndex(String aboutURL, Parser[] cellParsers) {
* @param aboutURL
* @param aboutIndex
*/
private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, Parser[] cellParsers, Resource table) {
private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParser[] cellParsers, Resource table) {
String aboutURL = getAboutURL(metadata, table);

// check for placeholder / column name that's being used to create subject IRI
Expand Down Expand Up @@ -317,6 +336,13 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, Parser[]
}
}

/**
* Get configured CSV file reader
*
* @param metadata
* @param reader
* @return
*/
private CSVReader getCSVReader(Model metadata, Reader reader) {
CSVParser parser = new CSVParserBuilder().build();
return new CSVReaderBuilder(reader).withSkipLines(1).withCSVParser(parser).build();
Expand All @@ -329,7 +355,7 @@ private CSVReader getCSVReader(Model metadata, Reader reader) {
* @param aboutURL
* @param aboutIndex
*/
private Resource getIRIorBnode(Parser[] cellParsers, String[] cells, String aboutURL, int aboutIndex,
private Resource getIRIorBnode(CellParser[] cellParsers, String[] cells, String aboutURL, int aboutIndex,
String placeholder) {
if (aboutIndex > -1) {
Value val = cellParsers[aboutIndex].parse(cells[aboutIndex]);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,40 +10,54 @@
*******************************************************************************/
package org.eclipse.rdf4j.rio.csvw.parsers;

import java.util.Set;

import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Namespace;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.util.Literals;
import org.eclipse.rdf4j.model.util.Values;
import org.eclipse.rdf4j.rio.RDFParseException;

/**
*
* @author Bart.Hanssens
* @author Bart Hanssens
*/
public class CellParser<T> {
private T minValue;
private T maxValue;
private T defaultValue;
public class CellParser {
private String name;
private IRI dataType;
private String defaultValue;
private boolean isRequired;
private String format;
private String propertyUrl;
private IRI propertyIRI;
private String valueUrl;
private String separator;

/**
* @param minValue the minValue to set
* @param name
*/
public void setName(String name) {
this.name = name;
}

/**
* @return name
*/
public void setMinValue(T minValue) {
this.minValue = minValue;
public String getName() {
return name;
}

/**
* @param maxValue the maxValue to set
* @param dataType
*/
public void setMaxValue(T maxValue) {
this.maxValue = maxValue;
public void setDataType(IRI dataType) {
this.dataType = dataType;
}

/**
* @param defaultValue the defaultValue to set
*/
public void setDefaultValue(T defaultValue) {
public void setDefaultValue(String defaultValue) {
this.defaultValue = defaultValue;
}

Expand All @@ -62,30 +76,42 @@ public void setFormat(String format) {
}

/**
* @return the propertyUrl
* @return the propertyUrl as IRI
*/
public String getPropertyUrl() {
return propertyUrl;
public IRI getPropertyIRI() {
return propertyIRI;
}

/**
* Set property URL (predicate IRI)
*
* @param namespaces set of namespaces
* @param propertyUrl the propertyUrl to set
*/
public void setPropertyUrl(String propertyUrl) {
this.propertyUrl = propertyUrl;
public void setPropertyURL(Set<Namespace> namespaces, String propertyUrl) {
this.propertyIRI = Values.iri(namespaces, propertyUrl);
}

/**
* Set property URL (predicate IRI) relative to document
*
* @param propertyUrl the propertyUrl to set
*/
public void setPropertyURL(String propertyUrl) {
this.propertyIRI = Values.iri("", propertyUrl);
}

/**
* @return the valueUrl
*/
public String getValueUrl() {
public String getValueURL() {
return valueUrl;
}

/**
* @param valueUrl the valueUrl to set
*/
public void setValueUrl(String valueUrl) {
public void setValueURL(String valueUrl) {
this.valueUrl = valueUrl;
}

Expand All @@ -103,12 +129,22 @@ public void setSeparator(String separator) {
this.separator = separator;
}

public T parse(Object cell) {
return (T) cell;
}

public void validate(T value) throws RDFParseException {

/**
* Get the value from a cell
*
* @param cell
* @return
*/
public Value parse(String cell) {
String s = cell;
if ((s == null || s.isEmpty()) && (defaultValue != null)) {
s = defaultValue;
}
if (valueUrl != null && s != null) {
return Values.iri(valueUrl.replace("{" + name + "}", s));
}

return Values.literal(s, dataType);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -10,75 +10,21 @@
*******************************************************************************/
package org.eclipse.rdf4j.rio.csvw.parsers;

import static org.eclipse.rdf4j.model.base.CoreDatatype.XSD.STRING;

import java.util.Optional;

import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.base.CoreDatatype.XSD;
import org.eclipse.rdf4j.model.util.Models;
import org.eclipse.rdf4j.model.vocabulary.CSVW;
import org.eclipse.rdf4j.rio.RDFParseException;
import org.eclipse.rdf4j.model.IRI;

/**
*
* @author Bart.Hanssens
* @author Bart Hanssens
*/
public class CellParserFactory {

private static XSD getDataType(Model model, Resource column) {
Optional<Value> val = Models.getProperty(model, column, CSVW.DATATYPE);
if (!val.isPresent()) {
return XSD.STRING;
}
Value v = val.get();
XSD datatype = null;

if (v instanceof Literal) {
datatype = XSD.valueOf(v.stringValue().toUpperCase());
}
if (v instanceof Resource) {
val = Models.getProperty(model, (Resource) v, CSVW.BASE);
if (!val.isPresent()) {
return XSD.STRING;
}
v = val.get();
datatype = XSD.valueOf(v.stringValue().toUpperCase());
}
if (datatype == null) {
throw new RDFParseException("Could not parse datatype of column");
}
return datatype;
}

/**
* Create a CellParser based on the (JSON-LD) metadata of a column
*
* @param Model
* @return
* Create a new CellParser based on datatype
* @param datatype
* @return
*/
public static CellParser fromMetadata(Model model, Resource column) {
CellParser parser;

XSD dataType = getDataType(model, column);
switch (dataType) {
case STRING:
parser = new CellParser<String>();
break;
case BOOLEAN:
parser = new CellParser<Boolean>();
break;
case INTEGER:
parser = new CellParser<Integer>();
break;
default:
parser = new CellParser<String>();
break;
}
return parser;
public static CellParser create(IRI datatype) {
CellParser p = new CellParser();
p.setDataType(datatype);
return p;
}
}
Loading

0 comments on commit 8a7544d

Please sign in to comment.