Skip to content

Commit

Permalink
eclipse-rdf4jGH-5058: additional parser code (WIP)
Browse files Browse the repository at this point in the history
  • Loading branch information
barthanssens committed Jul 15, 2024
1 parent 2b1d57f commit 489756f
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import java.util.Optional;
import java.util.stream.Collectors;

import org.apache.commons.lang3.CharSet;
import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Model;
Expand All @@ -48,6 +47,7 @@
import org.eclipse.rdf4j.rio.csvw.parsers.CellParser;
import org.eclipse.rdf4j.rio.csvw.parsers.CellParserFactory;
import org.eclipse.rdf4j.rio.helpers.AbstractRDFParser;

import org.slf4j.LoggerFactory;

import com.opencsv.CSVParser;
Expand All @@ -57,14 +57,17 @@
import com.opencsv.exceptions.CsvValidationException;

/**
* Basic (experimental) CSV on the Web parser.
*
* Currently only "minimal mode" is supported
* Experimental CSV on the Web parser.
*
* @author Bart Hanssens
*
* A CSV on the Web dataset consists of an existing CSV file and a metadata file (in JSON-LD) describing the columns.
* Parsers need to convert the data client-side, using that metadata.
*
* @see <a href="https://w3c.github.io/csvw/primer/">CSV on the Web Primer</a>
* @see <a href="https://w3c.github.io/csvw/syntax/">Model for Tabular Data and Metadata on the Web</a>
* @see <a href="https://w3c.github.io/csvw/metadata">Metadata Vocabulary for Tabular Data</a>
*
* @since 5.1.0
*/
public class CSVWParser extends AbstractRDFParser {
Expand All @@ -86,7 +89,7 @@ public synchronized void parse(InputStream in, String baseURI)
throw new RDFParseException("No metadata found");
}

RDFHandler rdfHandler = getRDFHandler();
rdfHandler = getRDFHandler();

boolean minimal = getParserConfig().get(CSVWParserSettings.MINIMAL_MODE);
Resource rootNode = minimal ? null : generateTablegroupNode(rdfHandler);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@
*******************************************************************************/
package org.eclipse.rdf4j.rio.csvw.parsers;

import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
Expand All @@ -29,6 +32,7 @@ public abstract class CellParser {
private static final Pattern PLACEHOLDERS = Pattern.compile("(\\{#?_?[^\\}]+\\})");

private String name;
private String encodedName;
private IRI dataType;
private String lang;
private String defaultValue;
Expand All @@ -45,18 +49,45 @@ public abstract class CellParser {
private boolean virtual = false;
private boolean suppressed = false;

private String aboutPlaceholder;
private String[] aboutPlaceholders;
private String[] propPlaceholders;
private String[] valPlaceholders;

private String valuePlaceholder;
private String[] valuePlaceholders;

/**
 * Returns the name of this column.
 *
 * @return the column name held by this parser, exactly as it was stored
 */
public String getName() {
	return this.name;
}

/**
 * Returns the URL encoded form of the column name.
 *
 * @return the encoded column name held by this parser
 */
public String getNameEncoded() {
	return this.encodedName;
}

/**
 * Set the name of the column and derive its URL encoded form (UTF-8).
 *
 * A {@code null} name is accepted: both the name and the encoded name are then {@code null}.
 *
 * @param name column name, may be {@code null}
 */
public void setName(String name) {
	this.name = name;
	// URLEncoder.encode() throws a NullPointerException on null input, so leave the encoded form unset instead
	// NOTE(review): URLEncoder uses application/x-www-form-urlencoded rules (a space becomes '+', not "%20");
	// confirm this matches the percent-encoding expected for column names in URI templates.
	this.encodedName = (name == null) ? null : URLEncoder.encode(name, StandardCharsets.UTF_8);
}


/**
 * Returns the datatype of this column.
 *
 * @return the datatype IRI held by this parser
 */
public IRI getDataType() {
	return this.dataType;
}
Expand Down Expand Up @@ -135,21 +166,39 @@ public void setVirtual(boolean virtual) {
this.virtual = virtual;
}


/**
* Extract placeholders (if any)
* Extract placeholder name for the own column, if any
*
* @param template URI template string
* @return array of placeholders
* @return placeholder name or null
*/
private String[] extractPlaceholders(String template) {
private String getOwnPlaceholder(String template) {
if (encodedName != null) {
String placeholder = "{" + encodedName + "}";
if (template.contains(placeholder)) {
return placeholder;
}
}
return null;
}

/**
 * Extract placeholder names for (values of) other columns, if any.
 *
 * Every distinct placeholder matched by {@code PLACEHOLDERS} is collected, except the placeholder of the own
 * column (see {@code getOwnPlaceholder}). The placeholders are returned including their curly braces, in no
 * particular order.
 *
 * @param template URI template string, may be {@code null}
 * @return array of distinct placeholders of other columns (empty when the template only contains the own
 *         placeholder), or {@code null} when the template is {@code null} or contains no placeholder at all
 */
private String[] getPlaceholders(String template) {
	if (template == null) {
		return null;
	}
	String ownPlaceholder = getOwnPlaceholder(template);
	Matcher matcher = PLACEHOLDERS.matcher(template);

	// Iterate with find() only: combining an initial find() with Matcher.results() skips the first match,
	// because results() does not reset the matcher and continues after the previous match.
	Set<String> placeholders = new HashSet<>();
	boolean found = false;
	while (matcher.find()) {
		found = true;
		String placeholder = matcher.group();
		if (!placeholder.equals(ownPlaceholder)) {
			placeholders.add(placeholder);
		}
	}
	// keep the "no placeholder at all" signal (null) distinct from "only the own placeholder" (empty array)
	return found ? placeholders.toArray(new String[0]) : null;
}
Expand All @@ -170,7 +219,9 @@ public String getAboutUrl() {
*/
public void setAboutUrl(String aboutUrl) {
	this.aboutUrl = aboutUrl;
	// check if this URL contains column placeholders:
	// the placeholder of the own column is kept separately from the placeholders of the other columns
	this.aboutPlaceholder = getOwnPlaceholder(aboutUrl);
	this.aboutPlaceholders = getPlaceholders(aboutUrl);
}

/**
Expand Down Expand Up @@ -217,7 +268,8 @@ public String getValueUrl() {
*/
public void setValueUrl(String valueUrl) {
	this.valueUrl = valueUrl;
	// check if this URL contains column placeholders:
	// the placeholder of the own column is kept separately from the placeholders of the other columns,
	// mirroring what setAboutUrl() does for the about URL
	this.valuePlaceholder = getOwnPlaceholder(valueUrl);
	this.valuePlaceholders = getPlaceholders(valueUrl);
}

/**
Expand Down

0 comments on commit 489756f

Please sign in to comment.