Skip to content

Commit

Permalink
Custom parsers / null literals should use pre-legalized names (issue #…
Browse files Browse the repository at this point in the history
  • Loading branch information
kosak committed Nov 9, 2024
1 parent 4cc4156 commit 8c1dc43
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 10 deletions.
9 changes: 6 additions & 3 deletions src/main/java/io/deephaven/csv/CsvSpecs.java
Original file line number Diff line number Diff line change
Expand Up @@ -105,9 +105,12 @@ public interface Builder {
Builder customTimeZoneParser(Tokenizer.CustomTimeZoneParser customTimeZoneParser);

/**
* An optional legalizer for column headers. The legalizer is a function that takes column names (as a
* {@code String[]}) names and returns legal column names (as a {@code String[]}). The legalizer function is
* permitted to reuse its input data structure. Defaults to {@code Function#identity()}.
* An optional legalizer for column headers. The legalizer is a function that takes the column names from the
* input (as a {@code String[]}) and returns "legal" column names (as a {@code String[]}). What constitutes
* "legal" is entirely up to the caller. For example, some applications cannot tolerate punctuation characters
* in column names and need to remove them. The CSV library itself has no limitations with regard to column names.
* The legalizer function is permitted to return the input array (perhaps with some elements modified) as the
* return value. Defaults to {@code Function#identity()}.
*/
Builder headerLegalizer(Function<String[], String[]> headerLegalizer);

Expand Down
16 changes: 9 additions & 7 deletions src/main/java/io/deephaven/csv/reading/CsvReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,7 @@ private static Result delimitedReadLogic(
headersTemp2 = headersTemp;
}
final int numOutputCols = headersTemp2.length;
final String[] headersToUse = canonicalizeHeaders(specs, headersTemp2);

return commonReadLogic(specs, grabber, firstDataRow, numInputCols, numOutputCols, headersToUse, sinkFactory);
return commonReadLogic(specs, grabber, firstDataRow, numInputCols, numOutputCols, headersTemp2, sinkFactory);
}

private static Result fixedReadLogic(
Expand All @@ -113,14 +111,17 @@ private static Result fixedReadLogic(

private static Result commonReadLogic(final CsvSpecs specs, CellGrabber grabber, byte[][] optionalFirstDataRow,
int numInputCols, int numOutputCols,
String[] headersToUse, final SinkFactory sinkFactory)
String[] headersBeforeLegalization, final SinkFactory sinkFactory)
throws CsvReaderException {

final String[][] nullValueLiteralsToUse = new String[numOutputCols][];
for (int ii = 0; ii < numOutputCols; ++ii) {
nullValueLiteralsToUse[ii] =
calcNullValueLiteralsToUse(specs, headersToUse[ii], ii).toArray(new String[0]);
calcNullValueLiteralsToUse(specs, headersBeforeLegalization[ii], ii).toArray(new String[0]);
}

final String[] headersToUse = canonicalizeHeaders(specs, headersBeforeLegalization);

// Create a DenseStorageWriter for each column. The arrays are sized to "numInputCols" but only populated up to
// "numOutputCols". The remaining (numInputCols - numOutputCols) are set to null. The code in
// parseInputToDenseStorge knows that having a null DenseStorageWriter means that the column is all-empty and
Expand Down Expand Up @@ -156,7 +157,7 @@ private static Result commonReadLogic(final CsvSpecs specs, CellGrabber grabber,
final ArrayList<Future<Object>> sinkFutures = new ArrayList<>();
try {
for (int ii = 0; ii < numOutputCols; ++ii) {
final List<Parser<?>> parsersToUse = calcParsersToUse(specs, headersToUse[ii], ii);
final List<Parser<?>> parsersToUse = calcParsersToUse(specs, headersBeforeLegalization[ii], ii);

final int iiCopy = ii;
final Future<Object> fcb = ecs.submit(
Expand Down Expand Up @@ -230,7 +231,8 @@ private static List<String> calcNullValueLiteralsToUse(final CsvSpecs specs, fin
}

private static String[] canonicalizeHeaders(CsvSpecs specs, final String[] headers) throws CsvReaderException {
final String[] legalized = specs.headerLegalizer().apply(headers);
// legalizer is allowed to mutate the input in-place, so we clone it before passing it.
final String[] legalized = specs.headerLegalizer().apply(headers.clone());
final Set<String> unique = new HashSet<>();
final List<String> repeats = new ArrayList<>();
final List<String> invalidNames = new ArrayList<>();
Expand Down
66 changes: 66 additions & 0 deletions src/test/java/io/deephaven/csv/CsvReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.junit.jupiter.api.Timeout;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;
import org.junit.jupiter.params.provider.CsvSources;
import org.junit.jupiter.params.provider.ValueSource;

import java.io.*;
Expand All @@ -41,6 +42,7 @@
import java.util.Collections;
import java.util.List;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;

Expand Down Expand Up @@ -206,6 +208,70 @@ public void bug162() throws CsvReaderException {
invokeTest(defaultCsvBuilder().parsers(Parsers.DEFAULT).build(), input, expected);
}

/**
* Reported in <a href="https://github.com/deephaven/deephaven-csv/issues/190">Deephaven CSV Issue #190</a>. That
* issue report misidentifies the root cause as having to do with reserved keywords. This is not correct because the
* library doesn't care whether a column header is a reserved keyword. The actual root cause is an interaction
* between the user-supplied "legalizer" and user-specified parsers or null literals that are specified by column
* names. Specifically the question is whether column names mentioned in {@link CsvSpecs.Builder#putParserForName}
* and {@link CsvSpecs.Builder#putNullValueLiteralsForName} should refer to the name that the column had *before* it
* was transformed by the legalizer, or *after*. The expected behavior is "before", but prior to this fix the
* library was doing the "after" behavior. This is a parameterized test that invokes the behavior for {delimited,
* fixed columns} x {without and with a legalizer}.
*/
@ParameterizedTest
@CsvSource({"false,false", "false,true", "true,false", "true,true"})
public void bug190(boolean hasFixedWidthColumns, boolean invokeLegalizer) throws CsvReaderException {
// +++ is the null value literal for Col1
// *** is the null value literal for Col2
// ??? is the null value literal for Col3

final String input;

if (!hasFixedWidthColumns) {
input = "Col1,Col2,Col3\n" +
"+++,20,30\n" +
"100,***,300\n" +
"1000,2000,???\n";
} else {
input = "Col1 Col2 Col3\n" +
"+++ 20 30\n" +
"100 *** 300\n" +
"1000 2000 ???\n";
}

final String[] expectedColumnNames = !invokeLegalizer ? new String[] {"Col1", "Col2", "Col3"}
: new String[] {"xyzCol1", "xyzCol2", "xyzCol3"};

final ColumnSet expected =
ColumnSet.of(
Column.ofValues(expectedColumnNames[0], Sentinels.NULL_LONG, (long) 100, (long) 1000),
Column.ofValues(expectedColumnNames[1], (double) 20, Sentinels.NULL_DOUBLE, (double) 2000),
Column.ofRefs(expectedColumnNames[2], "30", "300", null));

Function<String[], String[]> legalizer = in -> {
for (int i = 0; i != in.length; ++i) {
// e.g. transform Col1 to xyzCol1
in[i] = "xyz" + in[i];
}
return in;
};

CsvSpecs.Builder specsBase =
defaultCsvBuilder().hasFixedWidthColumns(hasFixedWidthColumns).parsers(Parsers.DEFAULT)
.putParserForName("Col1", Parsers.LONG).putParserForName("Col2", Parsers.DOUBLE)
.putParserForName("Col3", Parsers.STRING)
.putNullValueLiteralsForName("Col1", Collections.singletonList("+++"))
.putNullValueLiteralsForName("Col2", Collections.singletonList("***"))
.putNullValueLiteralsForName("Col3", Collections.singletonList("???"));

if (invokeLegalizer) {
specsBase = specsBase.headerLegalizer(legalizer);
}

invokeTest(specsBase.build(), input, expected);
}

@Test
public void validates() {
final String lengthyMessage = "CsvSpecs failed validation for the following reasons: "
Expand Down

0 comments on commit 8c1dc43

Please sign in to comment.