From f3cb1c137799164e0143f5a83675e54561e2f329 Mon Sep 17 00:00:00 2001 From: Corey Kosak Date: Mon, 4 Nov 2024 22:11:34 -0500 Subject: [PATCH] For fixed-width columns, the delimiter is the space character. It is no longer parameterizable --- .../io/deephaven/csv/reading/CsvReader.java | 2 +- .../csv/reading/cells/FixedCellGrabber.java | 6 +-- .../reading/headers/FixedHeaderFinder.java | 28 +++++------ .../java/io/deephaven/csv/CsvReaderTest.java | 48 ++++--------------- 4 files changed, 24 insertions(+), 60 deletions(-) diff --git a/src/main/java/io/deephaven/csv/reading/CsvReader.java b/src/main/java/io/deephaven/csv/reading/CsvReader.java index a6fb63d..68899ae 100644 --- a/src/main/java/io/deephaven/csv/reading/CsvReader.java +++ b/src/main/java/io/deephaven/csv/reading/CsvReader.java @@ -107,7 +107,7 @@ private static Result fixedReadLogic( final String[] headers = FixedHeaderFinder.determineHeadersToUse(specs, lineGrabber, columnWidths); final int numCols = headers.length; final CellGrabber grabber = new FixedCellGrabber(lineGrabber, columnWidths.getValue(), - specs.ignoreSurroundingSpaces(), (byte)specs.delimiter(), specs.useUtf32CountingConvention()); + specs.ignoreSurroundingSpaces(), specs.useUtf32CountingConvention()); return commonReadLogic(specs, grabber, null, numCols, numCols, headers, sinkFactory); } diff --git a/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java b/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java index 5402fd1..1732010 100644 --- a/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java +++ b/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java @@ -29,7 +29,6 @@ public static CellGrabber makeLineGrabber(InputStream stream) { private final CellGrabber lineGrabber; private final int[] columnWidths; private final boolean ignoreSurroundingSpaces; - private final byte delimiterAsByte; private final boolean utf32CountingMode; private final ByteSlice rowText; private boolean 
needsUnderlyingRefresh; @@ -39,11 +38,10 @@ public static CellGrabber makeLineGrabber(InputStream stream) { /** Constructor. */ public FixedCellGrabber(final CellGrabber lineGrabber, final int[] columnWidths, boolean ignoreSurroundingSpaces, - byte delimiterAsByte, boolean utf32CountingMode) { + boolean utf32CountingMode) { this.lineGrabber = lineGrabber; this.columnWidths = columnWidths; this.ignoreSurroundingSpaces = ignoreSurroundingSpaces; - this.delimiterAsByte = delimiterAsByte; this.utf32CountingMode = utf32CountingMode; this.rowText = new ByteSlice(); this.needsUnderlyingRefresh = true; @@ -79,7 +77,7 @@ public void grabNext(ByteSlice dest, MutableBoolean lastInRow, MutableBoolean en endOfInput.setValue(false); if (ignoreSurroundingSpaces) { - dest.trimPadding(delimiterAsByte); + ReaderUtil.trimWhitespace(dest); } } diff --git a/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java b/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java index e14e1d5..35cbb96 100644 --- a/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java +++ b/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java @@ -44,13 +44,12 @@ public static String[] determineHeadersToUse( } --skipCount; } - final byte paddingByte = (byte) specs.delimiter(); if (columnWidthsToUse.length == 0) { - columnWidthsToUse = inferColumnWidths(headerRow, paddingByte, specs.useUtf32CountingConvention()); + columnWidthsToUse = inferColumnWidths(headerRow, specs.useUtf32CountingConvention()); } headersToUse = - extractHeaders(headerRow, columnWidthsToUse, paddingByte, specs.useUtf32CountingConvention()); + extractHeaders(headerRow, columnWidthsToUse, specs.useUtf32CountingConvention()); } else { if (columnWidthsToUse.length == 0) { throw new CsvReaderException( @@ -81,17 +80,16 @@ public static String[] determineHeadersToUse( /** * Infer the column widths by looking for the transition from delimiter char to non-delimiter char. 
* @param row The input row - * @param delimiterAsByte The delimiter. As elsewhere, only 7-bit ASCII delimiters are supported. * @param useUtf32CountingConvention The character set convention we are using for units of width (either UTF-32 or * UTF-16) * @return The widths of the columns, in the specified character set convention. */ - private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, boolean useUtf32CountingConvention) { + private static int[] inferColumnWidths(ByteSlice row, boolean useUtf32CountingConvention) { // A column start is a non-delimiter character preceded by a delimiter (or present at the start of line). // If the start of the line is a delimiter, that is an error. final List columnWidths = new ArrayList<>(); final MutableInt charCountResult = new MutableInt(); - boolean prevCharIsDelimiter = false; + boolean prevCharIsSpace = false; final byte[] data = row.data(); int numChars = 0; int currentIndex = row.begin(); @@ -102,17 +100,15 @@ private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, bool } // If this character is not a delimiter, but the previous one was, then this is the start of a new column. 
byte ch = data[currentIndex]; - boolean thisCharIsDelimiter = ch == delimiterAsByte; - if (currentIndex == row.begin() && thisCharIsDelimiter) { - throw new IllegalArgumentException( - String.format("Header row cannot start with the delimiter character '%c'", - (char) delimiterAsByte)); + boolean thisCharIsSpace = ch == ' '; + if (currentIndex == row.begin() && thisCharIsSpace) { + throw new IllegalArgumentException("Header row cannot start with a space"); } - if (!thisCharIsDelimiter && prevCharIsDelimiter) { + if (!thisCharIsSpace && prevCharIsSpace) { columnWidths.add(numChars); numChars = 0; } - prevCharIsDelimiter = thisCharIsDelimiter; + prevCharIsSpace = thisCharIsSpace; final int utf8Length = ReaderUtil.getUtf8LengthAndCharLength(ch, row.end() - currentIndex, useUtf32CountingConvention, charCountResult); currentIndex += utf8Length; @@ -124,12 +120,10 @@ private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, bool * Extract the headers names from 'row'. * @param row The header row * @param columnWidths The width of the columns, in the UTF-32 or UTF-16 counting convention. 
- * @param paddingByte The delimiter character * @param utf32CountingMode Whether we are in the UTF-32 or UTF-16 counting mode * @return The array of headers */ - private static String[] extractHeaders(ByteSlice row, int[] columnWidths, byte paddingByte, - boolean utf32CountingMode) { + private static String[] extractHeaders(ByteSlice row, int[] columnWidths, boolean utf32CountingMode) { final int numCols = columnWidths.length; if (numCols == 0) { return new String[0]; @@ -146,7 +140,7 @@ private static String[] extractHeaders(ByteSlice row, int[] columnWidths, byte p final int proposedEndByte = beginByte + byteWidths[colNum]; final int actualEndByte = Math.min(proposedEndByte, row.end()); tempSlice.reset(row.data(), beginByte, actualEndByte); - tempSlice.trimPadding(paddingByte); + ReaderUtil.trimWhitespace(tempSlice); result[colNum] = tempSlice.toString(); beginByte = actualEndByte; } diff --git a/src/test/java/io/deephaven/csv/CsvReaderTest.java b/src/test/java/io/deephaven/csv/CsvReaderTest.java index f5c3da4..701048c 100644 --- a/src/test/java/io/deephaven/csv/CsvReaderTest.java +++ b/src/test/java/io/deephaven/csv/CsvReaderTest.java @@ -1884,7 +1884,7 @@ public void bug212() throws CsvReaderException { + "argocd Active 5y18d kubernetes.io/metadata.name=argocd\n" + "beta Not Active 4y235d kubernetes.io/metadata.name=beta\n"; - final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ') + final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true) .ignoreSurroundingSpaces(true).build(); final ColumnSet expected = ColumnSet.of( @@ -1917,7 +1917,7 @@ public void simpleFixedColumnWidths() throws CsvReaderException { Column.ofValues("SecurityId", 200, 300, 500)); final CsvSpecs specs = - defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ').ignoreSurroundingSpaces(true).build(); + defaultCsvBuilder().hasFixedWidthColumns(true).ignoreSurroundingSpaces(true).build(); invokeTest(specs, input, expected); } @@ -1943,7 +1943,7 
@@ public void fixedColumnWidthsFullCell() throws CsvReaderException { Column.ofValues("SecurityId", 200, 300)); final CsvSpecs specs = - defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ').ignoreSurroundingSpaces(true).build(); + defaultCsvBuilder().hasFixedWidthColumns(true).ignoreSurroundingSpaces(true).build(); invokeTest(specs, input, expected); } @@ -1968,7 +1968,7 @@ public void fixedColumnWidthsShortRows(boolean allowMissingColumns) throws CsvRe Column.ofValues("Price", Sentinels.NULL_DOUBLE, 0.15, 0.18, Sentinels.NULL_DOUBLE), Column.ofValues("SecurityId", Sentinels.NULL_INT, 300, 500, Sentinels.NULL_INT)); - final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ') + final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true) .ignoreSurroundingSpaces(true).allowMissingColumns(allowMissingColumns).build(); if (allowMissingColumns) { @@ -1979,32 +1979,6 @@ public void fixedColumnWidthsShortRows(boolean allowMissingColumns) throws CsvRe } } - /** - * We support other ASCII delimiters. In fixed-width mode, the meaning of "ignoreSurroundingSpaces" is expanded - * to mean "ignore surrounding delimiters". - */ - @Test - public void alternateDelimiter() throws CsvReaderException { - final String input = - "" - + "Sym___Type_____Price___SecurityId\n" - + "GOOG__Dividend_0.25____200\n" - + "T_____Dividend_0.15____300\n" - + "Z_____Dividend_0.18____500\n"; - - final ColumnSet expected = - ColumnSet.of( - Column.ofRefs("Sym", "GOOG", "T", "Z"), - Column.ofRefs("Type", "Dividend", "Dividend", "Dividend"), - Column.ofValues("Price", 0.25, 0.15, 0.18), - Column.ofValues("SecurityId", 200, 300, 500)); - - final CsvSpecs specs = - defaultCsvBuilder().hasFixedWidthColumns(true).delimiter('_').ignoreSurroundingSpaces(true).build(); - - invokeTest(specs, input, expected); - } - /** * If there is no header row, the caller needs to specify column widths. 
*/ @@ -2025,7 +1999,7 @@ public void noHeaderRowRequiresFixColumnWidthsSpecified(boolean specifyColumnWid Column.ofValues("Column4", 200, 300, 500)); final CsvSpecs.Builder specsBase = defaultCsvBuilder().hasFixedWidthColumns(true).hasHeaderRow(false) - .delimiter(' ').ignoreSurroundingSpaces(true); + .ignoreSurroundingSpaces(true); if (specifyColumnWidths) { final CsvSpecs specs = specsBase.fixedColumnWidths(Arrays.asList(6, 9, 8, 3)).build(); @@ -2063,7 +2037,7 @@ public void columnNamesMayBeSpecified(boolean specifyColumnNames) throws CsvRead Column.ofValues(expectedColumnNames[3], 200, 300, 500)); CsvSpecs.Builder specsBuilder = defaultCsvBuilder().hasFixedWidthColumns(true).hasHeaderRow(false) - .delimiter(' ').ignoreSurroundingSpaces(true).fixedColumnWidths(Arrays.asList(6, 9, 8, 3)); + .ignoreSurroundingSpaces(true).fixedColumnWidths(Arrays.asList(6, 9, 8, 3)); if (specifyColumnNames) { specsBuilder = specsBuilder.headers(Arrays.asList(expectedColumnNames)); @@ -2092,7 +2066,7 @@ public void countsBMPCharactersTheSame(boolean useUtf32CountingConvention) throw Column.ofValues("Price", 0.15, 0.18), Column.ofValues("SecurityId", 300, 500)); - final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ') + final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true) .ignoreSurroundingSpaces(true).useUtf32CountingConvention(useUtf32CountingConvention).build(); invokeTest(specs, input, expected); @@ -2125,7 +2099,7 @@ public void countsNonBMPCharactersDifferently(boolean useUtf32CountingConvention Column.ofRefs("Type", "💓💕💖Dividend", "Dividend")); } - final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ') + final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true) .ignoreSurroundingSpaces(true).useUtf32CountingConvention(useUtf32CountingConvention).build(); invokeTest(specs, input, expected); @@ -2166,7 +2140,7 @@ public void unicodeColumnHeaders(boolean useUtf32CountingConvention) 
throws CsvR Column.ofRefs("╔═╀═╗", "gh")); } - final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ') + final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true) .ignoreSurroundingSpaces(true).useUtf32CountingConvention(useUtf32CountingConvention).build(); invokeTest(specs, input, expected); @@ -2206,14 +2180,12 @@ public void brokenSurrogatePair(boolean useUtf32CountingConvention) throws CsvRe Column.ofRefs("C2", "😻 🧑💓")); } - final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ') + final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true) .ignoreSurroundingSpaces(false).useUtf32CountingConvention(useUtf32CountingConvention).build(); invokeTest(specs, input, expected); } - - private static final class RepeatingInputStream extends InputStream { private byte[] data; private final byte[] body;