Skip to content

Commit

Permalink
eclipse-rdf4jGH-5058: added code for CSV without JSON-LD metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
barthanssens committed Aug 29, 2024
1 parent 03cb21e commit 5b73659
Show file tree
Hide file tree
Showing 7 changed files with 216 additions and 73 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,13 @@
import java.io.Reader;
import java.net.URI;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;

import org.eclipse.rdf4j.model.BNode;
Expand All @@ -34,7 +32,6 @@
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.base.CoreDatatype.XSD;
import org.eclipse.rdf4j.model.impl.SimpleNamespace;
import org.eclipse.rdf4j.model.util.Models;
import org.eclipse.rdf4j.model.util.RDFCollections;
Expand All @@ -49,15 +46,15 @@
import org.eclipse.rdf4j.rio.RDFParseException;
import org.eclipse.rdf4j.rio.Rio;
import org.eclipse.rdf4j.rio.csvw.parsers.CellParser;
import org.eclipse.rdf4j.rio.csvw.parsers.CellParserFactory;
import org.eclipse.rdf4j.rio.helpers.AbstractRDFParser;
import org.eclipse.rdf4j.rio.helpers.JSONLDSettings;
import org.slf4j.LoggerFactory;

import com.opencsv.CSVParser;
import com.opencsv.CSVParserBuilder;
import com.opencsv.CSVReader;
import com.opencsv.CSVReaderBuilder;
import com.opencsv.exceptions.CsvValidationException;
import org.eclipse.rdf4j.model.impl.LinkedHashModel;
import org.eclipse.rdf4j.rio.csvw.metadata.CSVWMetadataProvider;
import org.slf4j.Logger;

/**
* Experimental CSV on the Web parser.
Expand All @@ -74,7 +71,11 @@
* @since 5.1.0
*/
public class CSVWParser extends AbstractRDFParser {
private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(CSVWParser.class);
private static final Logger LOGGER = LoggerFactory.getLogger(CSVWParser.class);

private static final ParserConfig METADATA_CFG =
new ParserConfig().set(JSONLDSettings.WHITELIST, Set.of("http://www.w3.org/ns/csvw"));


@Override
public RDFFormat getRDFFormat() {
Expand All @@ -86,11 +87,8 @@ public synchronized void parse(InputStream in, String baseURI)
throws IOException, RDFParseException, RDFHandlerException {

clear();

Model metadata = parseMetadata(in, null, baseURI);
if (metadata == null || metadata.isEmpty()) {
throw new RDFParseException("No metadata found");
}

Model metadata = getMetadataAsModel(in);

rdfHandler = getRDFHandler();

Expand Down Expand Up @@ -122,31 +120,31 @@ public synchronized void parse(InputStream in, String baseURI)
@Override
public void parse(Reader reader, String baseURI)
throws IOException, RDFParseException, RDFHandlerException {

Model metadata = parseMetadata(null, reader, baseURI);
throw new IOException("not implemented yet");
// Model metadata = parseMetadata(null, reader, baseURI);
}

/**
* Parse JSON-LD metadata
*
* @param in
* @param reader
* @param baseURI
* @return
* @throws IOException
*/
private Model parseMetadata(InputStream in, Reader reader, String baseURI) throws IOException {
Model metadata = null;
ParserConfig cfg = new ParserConfig();

private Model getMetadataAsModel(InputStream in) throws IOException {
Model m = null;
InputStream metadata = null;

if (in != null) {
metadata = Rio.parse(in, null, RDFFormat.JSONLD, cfg);
if (getParserConfig().get(CSVWParserSettings.METADATA_INPUT_MODE)) {
metadata = in;
} else {
CSVWMetadataProvider provider = getParserConfig().get(CSVWParserSettings.METADATA_FINDER);
if (provider != null) {
metadata = provider.getMetadata();
}
}

// if (reader != null) {
// return Rio.parse(reader, baseURI, RDFFormat.JSONLD, cfg);
// }
return metadata;
if (metadata != null) {
m = Rio.parse(metadata, null, RDFFormat.JSONLD, METADATA_CFG);
}
if (m == null) {
LOGGER.warn("No metadata found");
m = new LinkedHashModel();
}
return m;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@
package org.eclipse.rdf4j.rio.csvw;

import org.eclipse.rdf4j.rio.RioSetting;
import org.eclipse.rdf4j.rio.csvw.metadata.CSVWMetadataFinder;
import org.eclipse.rdf4j.rio.csvw.metadata.CSVWMetadataProvider;
import org.eclipse.rdf4j.rio.helpers.BooleanRioSetting;
import org.eclipse.rdf4j.rio.helpers.ClassRioSetting;
import org.eclipse.rdf4j.rio.helpers.StringRioSetting;

/**
* ParserSettings for the CSV on the Web parser features.
Expand All @@ -25,25 +29,68 @@
public class CSVWParserSettings {

/**
* Boolean setting for parser to determine whether syntactically invalid lines in CSVW generate a parse error.
* Boolean setting for parser to determine whether 'minimal mode' is to be used. I.e. only produce triples from the
* data cells, without adding table metadata .
* <p>
* Defaults to false.
* <p>
* Can be overridden by setting system property {@code org.eclipse.rdf4j.rio.csvw.minimal_mode}
*/
public static final BooleanRioSetting MINIMAL_MODE = new BooleanRioSetting(
"org.eclipse.rdf4j.rio.csvw.minimal_mode", "CSVWeb minimal mode", Boolean.FALSE);


/**
* Boolean setting for parser to determine whether the parser's InputStream is the JSON-LD metadata itself.
*
* When set to false, the input is not read as metadata; the metadata is requested from the provider
* configured via {@link #METADATA_FINDER} instead.
* <p>
* Defaults to true.
* <p>
* Can be overridden by setting system property {@code org.eclipse.rdf4j.rio.ntriples.fail_on_invalid_lines}
* Can be overridden by setting system property {@code org.eclipse.rdf4j.rio.csvw.metadata_json_file}
*/
public static final BooleanRioSetting FAIL_ON_INVALID_LINES = new BooleanRioSetting(
"org.eclipse.rdf4j.rio.csvw.fail_on_invalid_lines", "Fail on CSVW invalid lines", Boolean.TRUE);
// The key has to be unique per setting: "org.eclipse.rdf4j.rio.csvw.metadata_uri" already belongs to
// METADATA_URI (a String setting), so reusing it here would make both settings answer to the same key.
public static final BooleanRioSetting METADATA_INPUT_MODE = new BooleanRioSetting(
"org.eclipse.rdf4j.rio.csvw.metadata_input_mode", "Parser input is the JSON-LD metadata", true);


/**
* Boolean setting for parser to determine whether 'minimal mode' is to be used. I.e. only produce triples from the
* data cells, without adding table metadata .
* String setting for parser to provide location of a JSON metadata file.
*
* This implies that the parser's InputStream or Reader parameter points to a CSV file,
* not to a JSON-LD metadata file
* <p>
* Defaults to false.
* Defaults to null.
* <p>
* Can be overridden by setting system property {@code org.eclipse.rdf4j.rio.csvw.minimal_mode}
* Can be overridden by setting system property {@code org.eclipse.rdf4j.rio.csvw.metadata_uri}
*/
public static final BooleanRioSetting MINIMAL_MODE = new BooleanRioSetting(
"org.eclipse.rdf4j.rio.csvw.minimal_mode", "", Boolean.FALSE);
public static final StringRioSetting METADATA_URI = new StringRioSetting(
"org.eclipse.rdf4j.rio.csvw.metadata_uri", "Location of JSON metadata file", null);

/**
* Class setting for parser to supply the {@link CSVWMetadataProvider} that delivers the JSON-LD metadata.
*
* This implies that the parser's InputStream or Reader parameter points to a CSV file,
* not to a JSON-LD metadata file
* <p>
* Defaults to a {@link CSVWMetadataFinder} instance.
* <p>
* Can be overridden by setting system property {@code org.eclipse.rdf4j.rio.csvw.metadata_finder}
*/
public static final ClassRioSetting<? extends CSVWMetadataProvider> METADATA_FINDER = new ClassRioSetting<>(
"org.eclipse.rdf4j.rio.csvw.metadata_finder", "Metadata provider class", new CSVWMetadataFinder());

/**
* String setting for parser to provide location of the CSV data file.
*
* <p>
* Defaults to the empty string.
* <p>
* Can be overridden by setting system property {@code org.eclipse.rdf4j.rio.csvw.data_url}
*/
public static final StringRioSetting DATA_URL = new StringRioSetting(
"org.eclipse.rdf4j.rio.csvw.data_url", "Location (URL) of the CSV data", "");


/**
* Private constructor
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
package org.eclipse.rdf4j.rio.csvw;
package org.eclipse.rdf4j.rio.csvw.metadata;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -25,62 +26,53 @@
*
* @author Bart Hanssens
*/
public class CSVWMetadataFinder {
public class CSVWMetadataFinder implements CSVWMetadataProvider {
private static final Logger LOGGER = LoggerFactory.getLogger(CSVWMetadataFinder.class);

private static final String WELL_KNOWN = "/.well-known/csvm";
private static final String METADATA_EXT = "-metadata.json";
private static final String METADATA_CSV = "csv-metadata.json";
private static final String CSV = ".csv";

/**
* Open URI as input stream
*
* @param uri
* @return
*/
private static InputStream openURI(URI uri) {
try (InputStream is = uri.toURL().openStream()) {
return new ByteArrayInputStream(is.readAllBytes());
} catch (IOException ioe) {
LOGGER.debug("Could not open {}", uri);
return null;
}
private ByteArrayInputStream buffer;


@Override
public InputStream getMetadata() {
return buffer;
}


/**
* Find by adding metadata.json as file extension
*
* @param csvFile
* @return inputstream or null
*/
public static InputStream findByExtension(URI csvFile) {
public void findByExtension(URI csvFile) {
String s = csvFile.toString();
if (s.endsWith(CSV)) {
s = s.substring(0, s.length() - CSV.length());
}
URI metaURI = URI.create(s + METADATA_EXT);
return openURI(metaURI);
buffer = openURI(metaURI);
}

/**
* Find by trying to get the csv-metadata.json in the path
*
* @param csvFile
* @return inputstream or null
*/
public static InputStream findInPath(URI csvFile) {
public void findInPath(URI csvFile) {
URI metaURI = csvFile.resolve(METADATA_CSV);
return openURI(metaURI);
buffer = openURI(metaURI);
}

/**
* Try reading the well-known location
*
* @param csvFile
* @return URI or null
*/
public static InputStream findByWellKnown(URI csvFile) {
public void findByWellKnown(URI csvFile) {
URI wellKnown = csvFile.resolve(WELL_KNOWN);

try (InputStream is = wellKnown.toURL().openStream();
Expand All @@ -104,7 +96,7 @@ public static InputStream findByWellKnown(URI csvFile) {
metaURI = URI.create(s);
}
try (InputStream meta = metaURI.toURL().openStream()) {
return new ByteArrayInputStream(meta.readAllBytes());
buffer = new ByteArrayInputStream(meta.readAllBytes());
} catch (IOException ioe) {
LOGGER.debug("Could not open {}", metaURI);
}
Expand All @@ -113,6 +105,36 @@ public static InputStream findByWellKnown(URI csvFile) {
} catch (IOException ioe) {
LOGGER.info("Could not open {}", wellKnown);
}
return null;
}

/**
* Locate the CSVW metadata belonging to a CSV file, trying each lookup strategy in turn.
* <p>
* Any previously buffered metadata is discarded first. The strategies are tried in the order
* {@link #findByExtension(URI)}, {@link #findInPath(URI)}, {@link #findByWellKnown(URI)}; the search stops
* as soon as one of them has filled the buffer.
*
* @param csvFile location of the CSV file
*/
public void find(URI csvFile) {
	buffer = null;

	findByExtension(csvFile);
	if (buffer != null) {
		return;
	}

	findInPath(csvFile);
	if (buffer != null) {
		return;
	}

	findByWellKnown(csvFile);
}

/**
* Open URI and read its complete content into memory.
*
* @param uri location to read
* @return in-memory stream holding the content, or null if the URI could not be opened or read
*/
private ByteArrayInputStream openURI(URI uri) {
	try (InputStream is = uri.toURL().openStream()) {
		return new ByteArrayInputStream(is.readAllBytes());
	} catch (IOException | IllegalArgumentException e) {
		// URI.toURL() throws IllegalArgumentException for a non-absolute URI; treat that like any other
		// unreachable location so callers can simply check for null.
		// Passing the exception as last argument keeps the cause available in the debug log.
		LOGGER.debug("Could not open {}", uri, e);
		return null;
	}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*******************************************************************************
* Copyright (c) 2024 Eclipse RDF4J contributors.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
package org.eclipse.rdf4j.rio.csvw.metadata;

import java.io.InputStream;

/**
* Metadata provider that never supplies any metadata.
* Mostly for testing purposes, since not providing metadata will result in relative subject/property URIs
*
* @author Bart Hanssens
*/
public class CSVWMetadataNone implements CSVWMetadataProvider {

/**
* @return always {@code null}, i.e. no metadata stream is provided
*/
@Override
public InputStream getMetadata() {
return null;
}
}
Loading

0 comments on commit 5b73659

Please sign in to comment.