Skip to content

Commit

Permalink
eclipse-rdf4jGH-5058: additional parser code (WIP)
Browse files Browse the repository at this point in the history
  • Loading branch information
barthanssens committed Jul 12, 2024
1 parent ce0c8e5 commit b7c58ec
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ public class CSVW {
/** csvw:header */
public static final IRI HEADER;

/** csvw:headerRowCount */
public static final IRI HEADER_ROW_COUNT;

/** csvw:lang */
public static final IRI LANG;

Expand All @@ -90,6 +93,9 @@ public class CSVW {
/** csvw:required */
public static final IRI REQUIRED;

/** csvw:skipRows */
public static final IRI SKIP_ROWS;

/** csvw:tableSchema */
public static final IRI TABLE_SCHEMA;

Expand Down Expand Up @@ -121,11 +127,13 @@ public class CSVW {
FORMAT = Vocabularies.createIRI(NAMESPACE, "format");
GROUP_CHAR = Vocabularies.createIRI(NAMESPACE, "groupChar");
HEADER = Vocabularies.createIRI(NAMESPACE, "header");
HEADER_ROW_COUNT = Vocabularies.createIRI(NAMESPACE, "headerRowCount");
LANG = Vocabularies.createIRI(NAMESPACE, "lang");
NAME = Vocabularies.createIRI(NAMESPACE, "name");
PROPERTY_URL = Vocabularies.createIRI(NAMESPACE, "propertyUrl");
QUOTE_CHAR = Vocabularies.createIRI(NAMESPACE, "quoteChar");
REQUIRED = Vocabularies.createIRI(NAMESPACE, "required");
SKIP_ROWS = Vocabularies.createIRI(NAMESPACE, "skipRows");
TABLE_SCHEMA = Vocabularies.createIRI(NAMESPACE, "tableSchema");
TABLES = Vocabularies.createIRI(NAMESPACE, "tables");
TITLE = Vocabularies.createIRI(NAMESPACE, "title");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,9 @@
* Currently only "minimal mode" is supported
*
* @author Bart Hanssens
* @see <a href="https://w3c.github.io/csvw/primer/">CSV on the Web Primer</a>
*
* @see <a href="https://w3c.github.io/csvw/primer/">CSV on the Web Primer</a>
* @see <a href="https://w3c.github.io/csvw/metadata">Metadata Vocabulary for Tabular Data</a>
* @since 5.1.0
*/
public class CSVWParser extends AbstractRDFParser {
Expand Down Expand Up @@ -225,6 +226,8 @@ private CellParser getCellParser(Model metadata, Resource column) {
Models.getPropertyString(metadata, column, CSVW.DEFAULT).ifPresent(v -> parser.setDefaultValue(v));
Models.getPropertyString(metadata, column, CSVW.REQUIRED)
.ifPresent(v -> parser.setIsRequired(Boolean.parseBoolean(v)));
Models.getPropertyString(metadata, column, CSVW.VIRTUAL)
.ifPresent(v -> parser.setVirtual(Boolean.parseBoolean(v)));

// only useful for strings
Models.getPropertyString(metadata, column, CSVW.LANG).ifPresent(v -> parser.setLang(v));
Expand Down Expand Up @@ -297,8 +300,10 @@ private Optional<String> getFormat(Model metadata, Resource column) {
* @param subject
* @return aboutURL or null
*/
private String getAboutURL(Model metadata, Resource subject) {
return Models.getPropertyString(metadata, subject, CSVW.ABOUT_URL).orElse(null);
private String getAboutURL(Model metadata, Resource table) {
return Models.getPropertyString(metadata, table, CSVW.ABOUT_URL)
.orElse(Models.getPropertyString(metadata, getTableSchema(metadata, table), CSVW.ABOUT_URL)
.orElse(null));
}

/**
Expand Down Expand Up @@ -336,7 +341,7 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParse

// check for placeholder / column name that's being used to create subject IRI
int aboutIndex = getAboutIndex(aboutURL, cellParsers);
String placeholder = (aboutIndex > -1) ? cellParsers[aboutIndex].getName() : null;
String placeholder = (aboutIndex > -1) ? "{" + cellParsers[aboutIndex].getName() + "}" : null;

LOGGER.info("Parsing {}", csvFile);

Expand Down Expand Up @@ -382,14 +387,19 @@ private CSVReader getCSVReader(Model metadata, Resource table, Reader reader) {
Optional<Value> val = Models.getProperty(metadata, table, CSVW.DIALECT);
if (val.isPresent()) {
Resource dialect = (Resource) val.get();

// skip header (and possibly other) rows
String headerRows = Models.getPropertyString(metadata, dialect, CSVW.HEADER_ROW_COUNT).orElse("1");
String skipRows = Models.getPropertyString(metadata, dialect, CSVW.SKIP_ROWS).orElse("0");
int skip = Integer.valueOf(headerRows) + Integer.valueOf(skipRows);
Models.getPropertyString(metadata, dialect, CSVW.HEADER)
.ifPresent(v -> builder.withSkipLines(v.equalsIgnoreCase("false") ? 0 : skip));

Models.getPropertyString(metadata, dialect, CSVW.DELIMITER)
.ifPresent(v -> parserBuilder.withSeparator(v.charAt(0)));
Models.getPropertyString(metadata, dialect, CSVW.HEADER)
.ifPresent(v -> builder.withSkipLines(v.equalsIgnoreCase("false") ? 0 : 1));
Models.getPropertyString(metadata, dialect, CSVW.QUOTE_CHAR)
.ifPresent(v -> parserBuilder.withQuoteChar(v.charAt(0)));
}

return builder.withCSVParser(parserBuilder.build()).build();
}

Expand Down Expand Up @@ -423,7 +433,7 @@ private Resource getIRIorBnode(CellParser[] cellParsers, String[] cells, String
if (aboutIndex > -1) {
Value val = cellParsers[aboutIndex].parse(cells[aboutIndex]);
if (val != null) {
return Values.iri(aboutURL.replace(placeholder, val.toString()));
return Values.iri(aboutURL.replace(placeholder, val.stringValue()));
} else {
throw new RDFParseException("NULL value in aboutURL");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ public abstract class CellParser {
protected String lang;
protected String defaultValue;
protected boolean isRequired;
protected boolean isVirtual;
protected IRI propertyIRI;
protected String valueUrl;
protected String format;
Expand Down Expand Up @@ -174,6 +175,20 @@ protected String getValueOrDefault(String s) {
return s;
}

/**
* Reports whether this column has been flagged as virtual.
*
* Returns the current value of the {@code isVirtual} field of this cell parser.
*
* @return true if the column is virtual, false otherwise
*/
public boolean isVirtual() {
return this.isVirtual;
}

/**
* Flags this column as virtual or not.
*
* Stores the given value in the {@code isVirtual} field, which is what {@link #isVirtual()} returns.
*
* @param isVirtual true to mark the column as virtual, false otherwise
*/
public void setVirtual(boolean isVirtual) {
this.isVirtual = isVirtual;
}

/**
* Get the value from a cell
*
Expand Down
11 changes: 8 additions & 3 deletions core/rio/csvw/src/test/resources/painters-metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,23 @@
"@context": "http://www.w3.org/ns/csvw",
"url": "painters.csv",
"tableSchema": {
"aboutUrl": "https://www.wikidata.org/wiki/{wikidata_id}",
"columns": [
{ "name": "wikidata_id",
"datatype": "string",
"valueUrl": "https://www.wikidata.org/wiki/{wikidata_id}"},
"datatype": "string"},
{ "name": "first_name",
"propertyUrl": "schema:givenName"},
{ "name": "last_name",
"propertyUrl": "schema:familyName"},
{ "name": "country_id",
"propertyUrl": "schema:nationality",
"valueUrl": "https://www.wikidata.org/wiki/{country_id}"},
{ "name": "country_name_nl",
"lang": "nl" },
{ "name": "country_name_en",
"lang": "en" },
{ "name": "date_of_birth",
"propertyUrl": "schema:birthDate",
"datatype": {
"base": "date",
"format": "d/M/yyyy"
Expand All @@ -27,7 +29,10 @@
"format": "Yes|No"
} },
{ "name": "languages",
"separator": " " }
"separator": " " },
{ "virtual": true,
"propertyUrl": "rdf:type",
"valueUrl": "schema:Person" }
],
"primaryKey": "wikidata_id"
}
Expand Down

0 comments on commit b7c58ec

Please sign in to comment.