From 43aa09b49f30fa314c0c7481c6304760bfa8cb7e Mon Sep 17 00:00:00 2001 From: Bart Hanssens Date: Thu, 11 Jul 2024 13:53:35 +0100 Subject: [PATCH] GH-5058: additional parser code (WIP) --- .../eclipse/rdf4j/model/vocabulary/CSVW.java | 8 +++ .../eclipse/rdf4j/rio/csvw/CSVWParser.java | 50 +++++++++++++++---- .../rdf4j/rio/csvw/parsers/CellParser.java | 1 - .../rio/csvw/parsers/CellParserBoolean.java | 2 +- .../rio/csvw/parsers/CellParserDate.java | 2 +- ...rserDouble.java => CellParserDecimal.java} | 2 +- .../rio/csvw/parsers/CellParserFactory.java | 41 +++++++-------- ...ParserLOng.java => CellParserInteger.java} | 25 +++------- 8 files changed, 78 insertions(+), 53 deletions(-) rename core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/{CellParserDouble.java => CellParserDecimal.java} (95%) rename core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/{CellParserLOng.java => CellParserInteger.java} (58%) diff --git a/core/model-vocabulary/src/main/java/org/eclipse/rdf4j/model/vocabulary/CSVW.java b/core/model-vocabulary/src/main/java/org/eclipse/rdf4j/model/vocabulary/CSVW.java index 827477f1cd..603812446b 100644 --- a/core/model-vocabulary/src/main/java/org/eclipse/rdf4j/model/vocabulary/CSVW.java +++ b/core/model-vocabulary/src/main/java/org/eclipse/rdf4j/model/vocabulary/CSVW.java @@ -63,6 +63,9 @@ public class CSVW { /** csvw:dialect */ public static final IRI DIALECT; + /** csvw:encoding */ + public static final IRI ENCODING; + /** csvw:format */ public static final IRI FORMAT; @@ -81,6 +84,9 @@ public class CSVW { /** csvw:propertyUrl */ public static final IRI PROPERTY_URL; + /** csvw:quoteChar */ + public static final IRI QUOTE_CHAR; + /** csvw:required */ public static final IRI REQUIRED; @@ -111,12 +117,14 @@ public class CSVW { DEFAULT = Vocabularies.createIRI(NAMESPACE, "default"); DELIMITER = Vocabularies.createIRI(NAMESPACE, "delimiter"); DIALECT = Vocabularies.createIRI(NAMESPACE, "dialect"); + ENCODING = Vocabularies.createIRI(NAMESPACE, "encoding"); FORMAT = Vocabularies.createIRI(NAMESPACE, "format"); GROUP_CHAR = Vocabularies.createIRI(NAMESPACE, "groupChar"); HEADER = Vocabularies.createIRI(NAMESPACE, "header"); LANG = Vocabularies.createIRI(NAMESPACE, "lang"); NAME = Vocabularies.createIRI(NAMESPACE, "name"); PROPERTY_URL = Vocabularies.createIRI(NAMESPACE, "propertyUrl"); + QUOTE_CHAR = Vocabularies.createIRI(NAMESPACE, "quoteChar"); REQUIRED = Vocabularies.createIRI(NAMESPACE, "required"); TABLE_SCHEMA = Vocabularies.createIRI(NAMESPACE, "tableSchema"); TABLES = Vocabularies.createIRI(NAMESPACE, "tables"); diff --git a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWParser.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWParser.java index 62a1fc08ef..3ffe653a14 100644 --- a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWParser.java +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWParser.java @@ -16,12 +16,15 @@ import java.io.InputStreamReader; import java.io.Reader; import java.net.URI; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; +import org.apache.commons.lang3.CharSet; import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.Model; import org.eclipse.rdf4j.model.Resource; @@ -218,12 +221,12 @@ private CellParser getCellParser(Model metadata, Resource column) { getFormat(metadata, column).ifPresent(v -> parser.setFormat(v.stringValue())); Models.getPropertyString(metadata, column, CSVW.NAME) - .ifPresentOrElse(v -> parser.setName(v, + .ifPresentOrElse(v -> parser.setName(v), () -> new RDFParseException("Metadata file does not contain name for column " + column)); - Models.getPropertyString(metadata, column, CSVW.DEFAULT).ifPresent(v -> parser.setDefaultValue(v); + Models.getPropertyString(metadata, column, CSVW.DEFAULT).ifPresent(v -> parser.setDefaultValue(v)); Models.getPropertyString(metadata, column, CSVW.REQUIRED) - .ifPresent(v -> parser.setIsRequired(Boolean.parseBoolean(v)); + .ifPresent(v -> parser.setIsRequired(Boolean.parseBoolean(v))); Models.getPropertyString(metadata, column, CSVW.VALUE_URL).ifPresent(v -> parser.setValueURL(v)); // use a property from a vocabulary as predicate, or create a property relative to the namespace of the CSV @@ -261,7 +264,7 @@ private IRI getDatatypeIRI(Model metadata, Resource column) { } /** - * Get name of the generic datatype or more specific datatype + * Get name of the generic datatype or more specific datatype * * @param metadata * @param column @@ -329,10 +332,12 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParse String placeholder = (aboutIndex > -1) ? cellParsers[aboutIndex].getName() : null; LOGGER.info("Parsing {}", csvFile); - + + Charset encoding = getEncoding(metadata, table); + long line = 0; try (InputStream is = csvFile.toURL().openStream(); - BufferedReader reader = new BufferedReader(new InputStreamReader(is)); + BufferedReader reader = new BufferedReader(new InputStreamReader(is, encoding)); CSVReader csv = getCSVReader(metadata, table, reader)) { String[] cells; @@ -363,15 +368,38 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParse * @return */ private CSVReader getCSVReader(Model metadata, Resource table, Reader reader) { - CSVParser parser = new CSVParserBuilder().build(); + CSVParserBuilder parserBuilder = new CSVParserBuilder(); CSVReaderBuilder builder = new CSVReaderBuilder(reader); - + Optional dialect = Models.getProperty(metadata, table, CSVW.DIALECT); if (dialect.isPresent()) { - Models.getPropertyString(metadata, (Resource) dialect, CSVW.DELIMITER); + Models.getPropertyString(metadata, (Resource) dialect.get(), CSVW.DELIMITER) + .ifPresent(v -> parserBuilder.withSeparator(v.charAt(0))); + Models.getPropertyString(metadata, (Resource) dialect.get(), CSVW.HEADER) + .ifPresent(v -> builder.withSkipLines(v.equalsIgnoreCase("false") ? 1 : 0)); + Models.getPropertyString(metadata, (Resource) dialect.get(), CSVW.QUOTE_CHAR) + .ifPresent(v -> parserBuilder.withQuoteChar(v.charAt(0))); + } + + return new CSVReaderBuilder(reader).withCSVParser(parserBuilder.build()).build(); + } + + /** + * Get charset of the CSV, by default this should be UTF-8 + * + * @param metadata + * @param table + * @return charset + */ + private Charset getEncoding(Model metadata, Resource table) { + Optional dialect = Models.getProperty(metadata, table, CSVW.DIALECT); + if (dialect.isPresent()) { + Optional encoding = Models.getPropertyString(metadata, (Resource) dialect.get(), CSVW.ENCODING); + if (encoding.isPresent()) { + return Charset.forName(encoding.get()); + } } - - return new CSVReaderBuilder(reader).withSkipLines(1).withCSVParser(parser).build(); + return StandardCharsets.UTF_8; } /** diff --git a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParser.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParser.java index 99e99ca902..80389f8f30 100644 --- a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParser.java +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParser.java @@ -146,7 +146,6 @@ public void setDecimalChar(String decimalChar) { this.decimalChar = decimalChar; } - /** * @return the group character */ diff --git a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserBoolean.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserBoolean.java index c99ce66e5d..c5889b5074 100644 --- a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserBoolean.java +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserBoolean.java @@ -35,7 +35,7 @@ public void setFormat(String format) { @Override public Value parse(String cell) { String s = getValueOrDefault(cell); - + return Values.literal(valueTrue.equals(s) ? "true" : "false", dataType); } diff --git a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDate.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDate.java index e7416c2cc5..1b25e8e097 100644 --- a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDate.java +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDate.java @@ -34,7 +34,7 @@ public void setFormat(String format) { @Override public Value parse(String cell) { String s = getValueOrDefault(cell); - + if (formatter != null) { s = DateTimeFormatter.ISO_DATE.format(formatter.parse(s)); } diff --git a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDouble.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDecimal.java similarity index 95% rename from core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDouble.java rename to core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDecimal.java index 3000787e80..fde579392e 100644 --- a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDouble.java +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDecimal.java @@ -17,7 +17,7 @@ * * @author Bart Hanssens */ -public class CellParserDouble extends CellParser { +public class CellParserDecimal extends CellParser { @Override public Value parse(String cell) { diff --git a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserFactory.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserFactory.java index ce6c9e3c95..d9f3526c30 100644 --- a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserFactory.java +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserFactory.java @@ -32,26 +32,27 @@ public static CellParser create(IRI datatype) { p = new CellParserString(); } else { switch (xsdType) { - case BOOLEAN: - p = new CellParserBoolean(); - break; - case INTEGER: - case INT: - case SHORT: - case LONG: - p = new CellParserLong(); - break; - case FLOAT: - case DOUBLE: - p = new CellParserDouble(); - p.setDecimalChar("."); - break; - case DATE: - case DATETIME: - p = new CellParserDate(); - break; - default: - p = new CellParserString(); + case BOOLEAN: + p = new CellParserBoolean(); + break; + case INTEGER: + case INT: + case SHORT: + case LONG: + p = new CellParserInteger(); + break; + case FLOAT: + case DOUBLE: + p = new CellParserDecimal(); + p.setDecimalChar("."); + break; + case DATE: + case DATETIME: + case TIME: + p = new CellParserDate(); + break; + default: + p = new CellParserString(); } } p.setDataType(datatype); diff --git a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserLOng.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserInteger.java similarity index 58% rename from core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserLOng.java rename to core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserInteger.java index 41a4094a90..c5ca664f2a 100644 --- a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserLOng.java +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserInteger.java @@ -10,10 +10,6 @@ *******************************************************************************/ package org.eclipse.rdf4j.rio.csvw.parsers; -import java.util.Set; - -import org.eclipse.rdf4j.model.IRI; -import org.eclipse.rdf4j.model.Namespace; import org.eclipse.rdf4j.model.Value; import org.eclipse.rdf4j.model.util.Values; @@ -21,24 +17,17 @@ * * @author Bart Hanssens */ -public class CellParserBoolean extends CellParser { - private String valueTrue; - private String valueFalse; - - @Override - public void setFormat(String format) { - String[] values = format.split("\\|"); - valueTrue = values[0]; - valueFalse = values[1]; - } +public class CellParserInteger extends CellParser { @Override public Value parse(String cell) { - String s = cell; - if ((s == null || s.isEmpty()) && (defaultValue != null)) { - s = defaultValue; + String s = getValueOrDefault(cell); + + if (s != null && groupChar != null) { + s = s.replace(groupChar, ""); } - return Values.literal(valueTrue.equals(s) ? "true" : "false", dataType); + + return Values.literal(s, dataType); } }