Skip to content

Commit

Permalink
eclipse-rdf4jGH-5058: additional parser code (WIP)
Browse files Browse the repository at this point in the history
  • Loading branch information
barthanssens committed Jul 11, 2024
1 parent a56cb4b commit 43aa09b
Show file tree
Hide file tree
Showing 8 changed files with 78 additions and 53 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ public class CSVW {
/** csvw:dialect */
public static final IRI DIALECT;

/** csvw:encoding */
public static final IRI ENCODING;

/** csvw:format */
public static final IRI FORMAT;

Expand All @@ -81,6 +84,9 @@ public class CSVW {
/** csvw:propertyUrl */
public static final IRI PROPERTY_URL;

/** csvw:quoteChar */
public static final IRI QUOTE_CHAR;

/** csvw:required */
public static final IRI REQUIRED;

Expand Down Expand Up @@ -111,12 +117,14 @@ public class CSVW {
DEFAULT = Vocabularies.createIRI(NAMESPACE, "default");
DELIMITER = Vocabularies.createIRI(NAMESPACE, "delimiter");
DIALECT = Vocabularies.createIRI(NAMESPACE, "dialect");
ENCODING = Vocabularies.createIRI(NAMESPACE, "encoding");
FORMAT = Vocabularies.createIRI(NAMESPACE, "format");
GROUP_CHAR = Vocabularies.createIRI(NAMESPACE, "groupChar");
HEADER = Vocabularies.createIRI(NAMESPACE, "header");
LANG = Vocabularies.createIRI(NAMESPACE, "lang");
NAME = Vocabularies.createIRI(NAMESPACE, "name");
PROPERTY_URL = Vocabularies.createIRI(NAMESPACE, "propertyUrl");
QUOTE_CHAR = Vocabularies.createIRI(NAMESPACE, "quoteChar");
REQUIRED = Vocabularies.createIRI(NAMESPACE, "required");
TABLE_SCHEMA = Vocabularies.createIRI(NAMESPACE, "tableSchema");
TABLES = Vocabularies.createIRI(NAMESPACE, "tables");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,15 @@
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

import org.apache.commons.lang3.CharSet;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.Resource;
Expand Down Expand Up @@ -218,12 +221,12 @@ private CellParser getCellParser(Model metadata, Resource column) {
getFormat(metadata, column).ifPresent(v -> parser.setFormat(v.stringValue()));

Models.getPropertyString(metadata, column, CSVW.NAME)
.ifPresentOrElse(v -> parser.setName(v,
.ifPresentOrElse(v -> parser.setName(v),
() -> new RDFParseException("Metadata file does not contain name for column " + column));

Models.getPropertyString(metadata, column, CSVW.DEFAULT).ifPresent(v -> parser.setDefaultValue(v);
Models.getPropertyString(metadata, column, CSVW.DEFAULT).ifPresent(v -> parser.setDefaultValue(v));
Models.getPropertyString(metadata, column, CSVW.REQUIRED)
.ifPresent(v -> parser.setIsRequired(Boolean.parseBoolean(v));
.ifPresent(v -> parser.setIsRequired(Boolean.parseBoolean(v)));
Models.getPropertyString(metadata, column, CSVW.VALUE_URL).ifPresent(v -> parser.setValueURL(v));

// use a property from a vocabulary as predicate, or create a property relative to the namespace of the CSV
Expand Down Expand Up @@ -261,7 +264,7 @@ private IRI getDatatypeIRI(Model metadata, Resource column) {
}

/**
* Get name of the generic datatype or more specific datatype
* Get name of the generic datatype or more specific datatype
*
* @param metadata
* @param column
Expand Down Expand Up @@ -329,10 +332,12 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParse
String placeholder = (aboutIndex > -1) ? cellParsers[aboutIndex].getName() : null;

LOGGER.info("Parsing {}", csvFile);


Charset encoding = getEncoding(metadata, table);

long line = 0;
try (InputStream is = csvFile.toURL().openStream();
BufferedReader reader = new BufferedReader(new InputStreamReader(is));
BufferedReader reader = new BufferedReader(new InputStreamReader(is, encoding));
CSVReader csv = getCSVReader(metadata, table, reader)) {

String[] cells;
Expand Down Expand Up @@ -363,15 +368,38 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParse
* @return
*/
private CSVReader getCSVReader(Model metadata, Resource table, Reader reader) {
    // The parser builder holds the cell syntax (separator, quote character),
    // the reader builder holds the file-level settings (lines to skip).
    CSVParserBuilder parserBuilder = new CSVParserBuilder();
    CSVReaderBuilder builder = new CSVReaderBuilder(reader);

    // Skip the header line unless the dialect explicitly states there is none
    // (csvw:header defaults to true, and one line was always skipped before dialects were read).
    int skipLines = 1;

    // Only a resource can be the subject of dialect properties, anything else is ignored
    Optional<Resource> dialect = Models.getProperty(metadata, table, CSVW.DIALECT)
            .filter(Resource.class::isInstance)
            .map(Resource.class::cast);

    if (dialect.isPresent()) {
        Resource d = dialect.get();

        // Guard against empty strings: charAt(0) would throw on them
        Models.getPropertyString(metadata, d, CSVW.DELIMITER)
                .filter(v -> !v.isEmpty())
                .ifPresent(v -> parserBuilder.withSeparator(v.charAt(0)));
        Models.getPropertyString(metadata, d, CSVW.QUOTE_CHAR)
                .filter(v -> !v.isEmpty())
                .ifPresent(v -> parserBuilder.withQuoteChar(v.charAt(0)));

        boolean noHeader = Models.getPropertyString(metadata, d, CSVW.HEADER)
                .filter("false"::equalsIgnoreCase)
                .isPresent();
        if (noHeader) {
            skipLines = 0;
        }
    }

    // Build from the configured reader builder, a fresh one would drop the skip-lines setting
    return builder.withSkipLines(skipLines).withCSVParser(parserBuilder.build()).build();
}

/**
 * Get charset of the CSV, by default this should be UTF-8
 *
 * @param metadata metadata model that is searched for the dialect of the table
 * @param table    table whose csvw:dialect is looked up
 * @return charset named by csvw:encoding of the dialect, or UTF-8 when the table has no dialect resource or the
 *         dialect has no encoding
 * @throws java.nio.charset.IllegalCharsetNameException if the encoding name is not a legal charset name
 * @throws java.nio.charset.UnsupportedCharsetException if the named charset is not supported by this JVM
 */
private Charset getEncoding(Model metadata, Resource table) {
    return Models.getProperty(metadata, table, CSVW.DIALECT)
            // only a resource can be the subject of an encoding statement
            .filter(Resource.class::isInstance)
            .map(Resource.class::cast)
            .flatMap(dialect -> Models.getPropertyString(metadata, dialect, CSVW.ENCODING))
            .map(Charset::forName)
            .orElse(StandardCharsets.UTF_8);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,6 @@ public void setDecimalChar(String decimalChar) {
this.decimalChar = decimalChar;
}


/**
* @return the group character
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ public void setFormat(String format) {
@Override
public Value parse(String cell) {
String s = getValueOrDefault(cell);

return Values.literal(valueTrue.equals(s) ? "true" : "false", dataType);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ public void setFormat(String format) {
@Override
public Value parse(String cell) {
String s = getValueOrDefault(cell);

if (formatter != null) {
s = DateTimeFormatter.ISO_DATE.format(formatter.parse(s));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
*
* @author Bart Hanssens
*/
public class CellParserDouble extends CellParser {
public class CellParserDecimal extends CellParser {

@Override
public Value parse(String cell) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,26 +32,27 @@ public static CellParser create(IRI datatype) {
p = new CellParserString();
} else {
switch (xsdType) {
case BOOLEAN:
p = new CellParserBoolean();
break;
case INTEGER:
case INT:
case SHORT:
case LONG:
p = new CellParserLong();
break;
case FLOAT:
case DOUBLE:
p = new CellParserDouble();
p.setDecimalChar(".");
break;
case DATE:
case DATETIME:
p = new CellParserDate();
break;
default:
p = new CellParserString();
case BOOLEAN:
p = new CellParserBoolean();
break;
case INTEGER:
case INT:
case SHORT:
case LONG:
p = new CellParserInteger();
break;
case FLOAT:
case DOUBLE:
p = new CellParserDecimal();
p.setDecimalChar(".");
break;
case DATE:
case DATETIME:
case TIME:
p = new CellParserDate();
break;
default:
p = new CellParserString();
}
}
p.setDataType(datatype);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,35 +10,24 @@
*******************************************************************************/
package org.eclipse.rdf4j.rio.csvw.parsers;

import java.util.Set;

import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Namespace;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.util.Values;

/**
*
* @author Bart Hanssens
*/
public class CellParserBoolean extends CellParser {
private String valueTrue;
private String valueFalse;

@Override
public void setFormat(String format) {
String[] values = format.split("\\|");
valueTrue = values[0];
valueFalse = values[1];
}
public class CellParserInteger extends CellParser {

@Override
public Value parse(String cell) {
String s = cell;
if ((s == null || s.isEmpty()) && (defaultValue != null)) {
s = defaultValue;
String s = getValueOrDefault(cell);

if (s != null && groupChar != null) {
s = s.replace(groupChar, "");
}
return Values.literal(valueTrue.equals(s) ? "true" : "false", dataType);

return Values.literal(s, dataType);
}

}

0 comments on commit 43aa09b

Please sign in to comment.