Skip to content

Commit

Permalink
For fixed-width columns, the delimiter is the space character.
Browse files Browse the repository at this point in the history
It is no longer parameterizable.
  • Loading branch information
kosak committed Nov 5, 2024
1 parent c77fc4c commit f3cb1c1
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 60 deletions.
2 changes: 1 addition & 1 deletion src/main/java/io/deephaven/csv/reading/CsvReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ private static Result fixedReadLogic(
final String[] headers = FixedHeaderFinder.determineHeadersToUse(specs, lineGrabber, columnWidths);
final int numCols = headers.length;
final CellGrabber grabber = new FixedCellGrabber(lineGrabber, columnWidths.getValue(),
specs.ignoreSurroundingSpaces(), (byte)specs.delimiter(), specs.useUtf32CountingConvention());
specs.ignoreSurroundingSpaces(), specs.useUtf32CountingConvention());
return commonReadLogic(specs, grabber, null, numCols, numCols, headers, sinkFactory);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ public static CellGrabber makeLineGrabber(InputStream stream) {
private final CellGrabber lineGrabber;
private final int[] columnWidths;
private final boolean ignoreSurroundingSpaces;
private final byte delimiterAsByte;
private final boolean utf32CountingMode;
private final ByteSlice rowText;
private boolean needsUnderlyingRefresh;
Expand All @@ -39,11 +38,10 @@ public static CellGrabber makeLineGrabber(InputStream stream) {

/** Constructor. */
public FixedCellGrabber(final CellGrabber lineGrabber, final int[] columnWidths, boolean ignoreSurroundingSpaces,
byte delimiterAsByte, boolean utf32CountingMode) {
boolean utf32CountingMode) {
this.lineGrabber = lineGrabber;
this.columnWidths = columnWidths;
this.ignoreSurroundingSpaces = ignoreSurroundingSpaces;
this.delimiterAsByte = delimiterAsByte;
this.utf32CountingMode = utf32CountingMode;
this.rowText = new ByteSlice();
this.needsUnderlyingRefresh = true;
Expand Down Expand Up @@ -79,7 +77,7 @@ public void grabNext(ByteSlice dest, MutableBoolean lastInRow, MutableBoolean en
endOfInput.setValue(false);

if (ignoreSurroundingSpaces) {
dest.trimPadding(delimiterAsByte);
ReaderUtil.trimWhitespace(dest);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,12 @@ public static String[] determineHeadersToUse(
}
--skipCount;
}
final byte paddingByte = (byte) specs.delimiter();
if (columnWidthsToUse.length == 0) {
columnWidthsToUse = inferColumnWidths(headerRow, paddingByte, specs.useUtf32CountingConvention());
columnWidthsToUse = inferColumnWidths(headerRow, specs.useUtf32CountingConvention());
}

headersToUse =
extractHeaders(headerRow, columnWidthsToUse, paddingByte, specs.useUtf32CountingConvention());
extractHeaders(headerRow, columnWidthsToUse, specs.useUtf32CountingConvention());
} else {
if (columnWidthsToUse.length == 0) {
throw new CsvReaderException(
Expand Down Expand Up @@ -81,17 +80,16 @@ public static String[] determineHeadersToUse(
/**
* Infer the column widths by looking for the transition from a space character to a non-space character.
* @param row The input row
* @param delimiterAsByte The delimiter. As elsewhere, only 7-bit ASCII delimiters are supported.
* @param useUtf32CountingConvention The character set convention we are using for units of width (either UTF-32 or
* UTF-16)
* @return The widths of the columns, in the specified character set convention.
*/
private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, boolean useUtf32CountingConvention) {
private static int[] inferColumnWidths(ByteSlice row, boolean useUtf32CountingConvention) {
// A column start is a non-space character preceded by a space (or present at the start of the line).
// If the line starts with a space, that is an error.
final List<Integer> columnWidths = new ArrayList<>();
final MutableInt charCountResult = new MutableInt();
boolean prevCharIsDelimiter = false;
boolean prevCharIsSpace = false;
final byte[] data = row.data();
int numChars = 0;
int currentIndex = row.begin();
Expand All @@ -102,17 +100,15 @@ private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, bool
}
// If this character is not a space, but the previous one was, then this is the start of a new column.
byte ch = data[currentIndex];
boolean thisCharIsDelimiter = ch == delimiterAsByte;
if (currentIndex == row.begin() && thisCharIsDelimiter) {
throw new IllegalArgumentException(
String.format("Header row cannot start with the delimiter character '%c'",
(char) delimiterAsByte));
boolean thisCharIsSpace = ch == ' ';
if (currentIndex == row.begin() && thisCharIsSpace) {
throw new IllegalArgumentException("Header row cannot start with a space");
}
if (!thisCharIsDelimiter && prevCharIsDelimiter) {
if (!thisCharIsSpace && prevCharIsSpace) {
columnWidths.add(numChars);
numChars = 0;
}
prevCharIsDelimiter = thisCharIsDelimiter;
prevCharIsSpace = thisCharIsSpace;
final int utf8Length = ReaderUtil.getUtf8LengthAndCharLength(ch, row.end() - currentIndex,
useUtf32CountingConvention, charCountResult);
currentIndex += utf8Length;
Expand All @@ -124,12 +120,10 @@ private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, bool
* Extract the headers names from 'row'.
* @param row The header row
* @param columnWidths The width of the columns, in the UTF-32 or UTF-16 counting convention.
* @param paddingByte The delimiter character
* @param utf32CountingMode Whether we are in the UTF-32 or UTF-16 counting mode
* @return The array of headers
*/
private static String[] extractHeaders(ByteSlice row, int[] columnWidths, byte paddingByte,
boolean utf32CountingMode) {
private static String[] extractHeaders(ByteSlice row, int[] columnWidths, boolean utf32CountingMode) {
final int numCols = columnWidths.length;
if (numCols == 0) {
return new String[0];
Expand All @@ -146,7 +140,7 @@ private static String[] extractHeaders(ByteSlice row, int[] columnWidths, byte p
final int proposedEndByte = beginByte + byteWidths[colNum];
final int actualEndByte = Math.min(proposedEndByte, row.end());
tempSlice.reset(row.data(), beginByte, actualEndByte);
tempSlice.trimPadding(paddingByte);
ReaderUtil.trimWhitespace(tempSlice);
result[colNum] = tempSlice.toString();
beginByte = actualEndByte;
}
Expand Down
48 changes: 10 additions & 38 deletions src/test/java/io/deephaven/csv/CsvReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -1884,7 +1884,7 @@ public void bug212() throws CsvReaderException {
+ "argocd Active 5y18d kubernetes.io/metadata.name=argocd\n"
+ "beta Not Active 4y235d kubernetes.io/metadata.name=beta\n";

final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true)
.ignoreSurroundingSpaces(true).build();

final ColumnSet expected = ColumnSet.of(
Expand Down Expand Up @@ -1917,7 +1917,7 @@ public void simpleFixedColumnWidths() throws CsvReaderException {
Column.ofValues("SecurityId", 200, 300, 500));

final CsvSpecs specs =
defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ').ignoreSurroundingSpaces(true).build();
defaultCsvBuilder().hasFixedWidthColumns(true).ignoreSurroundingSpaces(true).build();

invokeTest(specs, input, expected);
}
Expand All @@ -1943,7 +1943,7 @@ public void fixedColumnWidthsFullCell() throws CsvReaderException {
Column.ofValues("SecurityId", 200, 300));

final CsvSpecs specs =
defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ').ignoreSurroundingSpaces(true).build();
defaultCsvBuilder().hasFixedWidthColumns(true).ignoreSurroundingSpaces(true).build();
invokeTest(specs, input, expected);
}

Expand All @@ -1968,7 +1968,7 @@ public void fixedColumnWidthsShortRows(boolean allowMissingColumns) throws CsvRe
Column.ofValues("Price", Sentinels.NULL_DOUBLE, 0.15, 0.18, Sentinels.NULL_DOUBLE),
Column.ofValues("SecurityId", Sentinels.NULL_INT, 300, 500, Sentinels.NULL_INT));

final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true)
.ignoreSurroundingSpaces(true).allowMissingColumns(allowMissingColumns).build();

if (allowMissingColumns) {
Expand All @@ -1979,32 +1979,6 @@ public void fixedColumnWidthsShortRows(boolean allowMissingColumns) throws CsvRe
}
}

/**
* We support other ASCII delimiters. In fixed-width mode, the meaning of "ignoreSurroundingSpaces" is expanded
* to mean "ignore surrounding delimiters".
*/
@Test
public void alternateDelimiter() throws CsvReaderException {
final String input =
""
+ "Sym___Type_____Price___SecurityId\n"
+ "GOOG__Dividend_0.25____200\n"
+ "T_____Dividend_0.15____300\n"
+ "Z_____Dividend_0.18____500\n";

final ColumnSet expected =
ColumnSet.of(
Column.ofRefs("Sym", "GOOG", "T", "Z"),
Column.ofRefs("Type", "Dividend", "Dividend", "Dividend"),
Column.ofValues("Price", 0.25, 0.15, 0.18),
Column.ofValues("SecurityId", 200, 300, 500));

final CsvSpecs specs =
defaultCsvBuilder().hasFixedWidthColumns(true).delimiter('_').ignoreSurroundingSpaces(true).build();

invokeTest(specs, input, expected);
}

/**
* If there is no header row, the caller needs to specify column widths.
*/
Expand All @@ -2025,7 +1999,7 @@ public void noHeaderRowRequiresFixColumnWidthsSpecified(boolean specifyColumnWid
Column.ofValues("Column4", 200, 300, 500));

final CsvSpecs.Builder specsBase = defaultCsvBuilder().hasFixedWidthColumns(true).hasHeaderRow(false)
.delimiter(' ').ignoreSurroundingSpaces(true);
.ignoreSurroundingSpaces(true);

if (specifyColumnWidths) {
final CsvSpecs specs = specsBase.fixedColumnWidths(Arrays.asList(6, 9, 8, 3)).build();
Expand Down Expand Up @@ -2063,7 +2037,7 @@ public void columnNamesMayBeSpecified(boolean specifyColumnNames) throws CsvRead
Column.ofValues(expectedColumnNames[3], 200, 300, 500));

CsvSpecs.Builder specsBuilder = defaultCsvBuilder().hasFixedWidthColumns(true).hasHeaderRow(false)
.delimiter(' ').ignoreSurroundingSpaces(true).fixedColumnWidths(Arrays.asList(6, 9, 8, 3));
.ignoreSurroundingSpaces(true).fixedColumnWidths(Arrays.asList(6, 9, 8, 3));

if (specifyColumnNames) {
specsBuilder = specsBuilder.headers(Arrays.asList(expectedColumnNames));
Expand Down Expand Up @@ -2092,7 +2066,7 @@ public void countsBMPCharactersTheSame(boolean useUtf32CountingConvention) throw
Column.ofValues("Price", 0.15, 0.18),
Column.ofValues("SecurityId", 300, 500));

final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true)
.ignoreSurroundingSpaces(true).useUtf32CountingConvention(useUtf32CountingConvention).build();

invokeTest(specs, input, expected);
Expand Down Expand Up @@ -2125,7 +2099,7 @@ public void countsNonBMPCharactersDifferently(boolean useUtf32CountingConvention
Column.ofRefs("Type", "💓💕💖Dividend", "Dividend"));
}

final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true)
.ignoreSurroundingSpaces(true).useUtf32CountingConvention(useUtf32CountingConvention).build();

invokeTest(specs, input, expected);
Expand Down Expand Up @@ -2166,7 +2140,7 @@ public void unicodeColumnHeaders(boolean useUtf32CountingConvention) throws CsvR
Column.ofRefs("╔═╤═╗", "gh"));
}

final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true)
.ignoreSurroundingSpaces(true).useUtf32CountingConvention(useUtf32CountingConvention).build();

invokeTest(specs, input, expected);
Expand Down Expand Up @@ -2206,14 +2180,12 @@ public void brokenSurrogatePair(boolean useUtf32CountingConvention) throws CsvRe
Column.ofRefs("C2", "😻 🧡💓"));
}

final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true)
.ignoreSurroundingSpaces(false).useUtf32CountingConvention(useUtf32CountingConvention).build();

invokeTest(specs, input, expected);
}



private static final class RepeatingInputStream extends InputStream {
private byte[] data;
private final byte[] body;
Expand Down

0 comments on commit f3cb1c1

Please sign in to comment.