For fixed-width columns, the delimiter is the space character.

It is no longer parameterizable.
deephaven · Nov 5, 2024 · f3cb1c1 · f3cb1c1
1 parent c77fc4c
commit f3cb1c1
Show file tree

Hide file tree

Showing 4 changed files with 24 additions and 60 deletions.
diff --git a/src/main/java/io/deephaven/csv/reading/CsvReader.java b/src/main/java/io/deephaven/csv/reading/CsvReader.java
@@ -107,7 +107,7 @@ private static Result fixedReadLogic(
         final String[] headers = FixedHeaderFinder.determineHeadersToUse(specs, lineGrabber, columnWidths);
         final int numCols = headers.length;
         final CellGrabber grabber = new FixedCellGrabber(lineGrabber, columnWidths.getValue(),
-                specs.ignoreSurroundingSpaces(), (byte)specs.delimiter(), specs.useUtf32CountingConvention());
+                specs.ignoreSurroundingSpaces(), specs.useUtf32CountingConvention());
         return commonReadLogic(specs, grabber, null, numCols, numCols, headers, sinkFactory);
     }
 

diff --git a/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java b/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java
@@ -29,7 +29,6 @@ public static CellGrabber makeLineGrabber(InputStream stream) {
     private final CellGrabber lineGrabber;
     private final int[] columnWidths;
     private final boolean ignoreSurroundingSpaces;
-    private final byte delimiterAsByte;
     private final boolean utf32CountingMode;
     private final ByteSlice rowText;
     private boolean needsUnderlyingRefresh;
@@ -39,11 +38,10 @@ public static CellGrabber makeLineGrabber(InputStream stream) {
 
     /** Constructor. */
     public FixedCellGrabber(final CellGrabber lineGrabber, final int[] columnWidths, boolean ignoreSurroundingSpaces,
-            byte delimiterAsByte, boolean utf32CountingMode) {
+            boolean utf32CountingMode) {
         this.lineGrabber = lineGrabber;
         this.columnWidths = columnWidths;
         this.ignoreSurroundingSpaces = ignoreSurroundingSpaces;
-        this.delimiterAsByte = delimiterAsByte;
         this.utf32CountingMode = utf32CountingMode;
         this.rowText = new ByteSlice();
         this.needsUnderlyingRefresh = true;
@@ -79,7 +77,7 @@ public void grabNext(ByteSlice dest, MutableBoolean lastInRow, MutableBoolean en
         endOfInput.setValue(false);
 
         if (ignoreSurroundingSpaces) {
-            dest.trimPadding(delimiterAsByte);
+            ReaderUtil.trimWhitespace(dest);
         }
     }
 

diff --git a/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java b/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java
@@ -44,13 +44,12 @@ public static String[] determineHeadersToUse(
                 }
                 --skipCount;
             }
-            final byte paddingByte = (byte) specs.delimiter();
             if (columnWidthsToUse.length == 0) {
-                columnWidthsToUse = inferColumnWidths(headerRow, paddingByte, specs.useUtf32CountingConvention());
+                columnWidthsToUse = inferColumnWidths(headerRow, specs.useUtf32CountingConvention());
             }
 
             headersToUse =
-                    extractHeaders(headerRow, columnWidthsToUse, paddingByte, specs.useUtf32CountingConvention());
+                    extractHeaders(headerRow, columnWidthsToUse, specs.useUtf32CountingConvention());
         } else {
             if (columnWidthsToUse.length == 0) {
                 throw new CsvReaderException(
@@ -81,17 +80,16 @@ public static String[] determineHeadersToUse(
     /**
      * Infer the column widths by looking for the transition from delimiter char to non-delimiter char.
      * @param row The input row
-     * @param delimiterAsByte The delimiter. As elsewhere, only 7-bit ASCII delimiters are supported.
      * @param useUtf32CountingConvention The character set convention we are using for units of width (either UTF-32 or
      *                                   UTF-16)
      * @return The widths of the columns, in the specified character set convention.
      */
-    private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, boolean useUtf32CountingConvention) {
+    private static int[] inferColumnWidths(ByteSlice row, boolean useUtf32CountingConvention) {
         // A column start is a non-delimiter character preceded by a delimiter (or present at the start of line).
         // If the start of the line is a delimiter, that is an error.
         final List<Integer> columnWidths = new ArrayList<>();
         final MutableInt charCountResult = new MutableInt();
-        boolean prevCharIsDelimiter = false;
+        boolean prevCharIsSpace = false;
         final byte[] data = row.data();
         int numChars = 0;
         int currentIndex = row.begin();
@@ -102,17 +100,15 @@ private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, bool
             }
             // If this character is not a delimiter, but the previous one was, then this is the start of a new column.
             byte ch = data[currentIndex];
-            boolean thisCharIsDelimiter = ch == delimiterAsByte;
-            if (currentIndex == row.begin() && thisCharIsDelimiter) {
-                throw new IllegalArgumentException(
-                        String.format("Header row cannot start with the delimiter character '%c'",
-                                (char) delimiterAsByte));
+            boolean thisCharIsSpace = ch == ' ';
+            if (currentIndex == row.begin() && thisCharIsSpace) {
+                throw new IllegalArgumentException("Header row cannot start with a space");
             }
-            if (!thisCharIsDelimiter && prevCharIsDelimiter) {
+            if (!thisCharIsSpace && prevCharIsSpace) {
                 columnWidths.add(numChars);
                 numChars = 0;
             }
-            prevCharIsDelimiter = thisCharIsDelimiter;
+            prevCharIsSpace = thisCharIsSpace;
             final int utf8Length = ReaderUtil.getUtf8LengthAndCharLength(ch, row.end() - currentIndex,
                     useUtf32CountingConvention, charCountResult);
             currentIndex += utf8Length;
@@ -124,12 +120,10 @@ private static int[] inferColumnWidths(ByteSlice row, byte delimiterAsByte, bool
      * Extract the headers names from 'row'.
      * @param row The header row
      * @param columnWidths The width of the columns, in the UTF-32 or UTF-16 counting convention.
-     * @param paddingByte The delimiter character
      * @param utf32CountingMode Whether we are in the UTF-32 or UTF-16 counting mode
      * @return The array of headers
      */
-    private static String[] extractHeaders(ByteSlice row, int[] columnWidths, byte paddingByte,
-            boolean utf32CountingMode) {
+    private static String[] extractHeaders(ByteSlice row, int[] columnWidths, boolean utf32CountingMode) {
         final int numCols = columnWidths.length;
         if (numCols == 0) {
             return new String[0];
@@ -146,7 +140,7 @@ private static String[] extractHeaders(ByteSlice row, int[] columnWidths, byte p
             final int proposedEndByte = beginByte + byteWidths[colNum];
             final int actualEndByte = Math.min(proposedEndByte, row.end());
             tempSlice.reset(row.data(), beginByte, actualEndByte);
-            tempSlice.trimPadding(paddingByte);
+            ReaderUtil.trimWhitespace(tempSlice);
             result[colNum] = tempSlice.toString();
             beginByte = actualEndByte;
         }

diff --git a/src/test/java/io/deephaven/csv/CsvReaderTest.java b/src/test/java/io/deephaven/csv/CsvReaderTest.java
@@ -1884,7 +1884,7 @@ public void bug212() throws CsvReaderException {
                         + "argocd                   Active       5y18d    kubernetes.io/metadata.name=argocd\n"
                         + "beta                     Not Active   4y235d   kubernetes.io/metadata.name=beta\n";
 
-        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true)
                 .ignoreSurroundingSpaces(true).build();
 
         final ColumnSet expected = ColumnSet.of(
@@ -1917,7 +1917,7 @@ public void simpleFixedColumnWidths() throws CsvReaderException {
                         Column.ofValues("SecurityId", 200, 300, 500));
 
         final CsvSpecs specs =
-                defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ').ignoreSurroundingSpaces(true).build();
+                defaultCsvBuilder().hasFixedWidthColumns(true).ignoreSurroundingSpaces(true).build();
 
         invokeTest(specs, input, expected);
     }
@@ -1943,7 +1943,7 @@ public void fixedColumnWidthsFullCell() throws CsvReaderException {
                         Column.ofValues("SecurityId", 200, 300));
 
         final CsvSpecs specs =
-                defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ').ignoreSurroundingSpaces(true).build();
+                defaultCsvBuilder().hasFixedWidthColumns(true).ignoreSurroundingSpaces(true).build();
         invokeTest(specs, input, expected);
     }
 
@@ -1968,7 +1968,7 @@ public void fixedColumnWidthsShortRows(boolean allowMissingColumns) throws CsvRe
                         Column.ofValues("Price", Sentinels.NULL_DOUBLE, 0.15, 0.18, Sentinels.NULL_DOUBLE),
                         Column.ofValues("SecurityId", Sentinels.NULL_INT, 300, 500, Sentinels.NULL_INT));
 
-        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true)
                 .ignoreSurroundingSpaces(true).allowMissingColumns(allowMissingColumns).build();
 
         if (allowMissingColumns) {
@@ -1979,32 +1979,6 @@ public void fixedColumnWidthsShortRows(boolean allowMissingColumns) throws CsvRe
         }
     }
 
-    /**
-     * We support other ASCII delimiters. In fixed-width mode, the meaning of "ignoreSurroundingSpaces" is expanded
-     * to mean "ignore surrounding delimiters".
-     */
-    @Test
-    public void alternateDelimiter() throws CsvReaderException {
-        final String input =
-                ""
-                        + "Sym___Type_____Price___SecurityId\n"
-                        + "GOOG__Dividend_0.25____200\n"
-                        + "T_____Dividend_0.15____300\n"
-                        + "Z_____Dividend_0.18____500\n";
-
-        final ColumnSet expected =
-                ColumnSet.of(
-                        Column.ofRefs("Sym", "GOOG", "T", "Z"),
-                        Column.ofRefs("Type", "Dividend", "Dividend", "Dividend"),
-                        Column.ofValues("Price", 0.25, 0.15, 0.18),
-                        Column.ofValues("SecurityId", 200, 300, 500));
-
-        final CsvSpecs specs =
-                defaultCsvBuilder().hasFixedWidthColumns(true).delimiter('_').ignoreSurroundingSpaces(true).build();
-
-        invokeTest(specs, input, expected);
-    }
-
     /**
      * If there is no header row, the caller needs to specify column widths.
      */
@@ -2025,7 +1999,7 @@ public void noHeaderRowRequiresFixColumnWidthsSpecified(boolean specifyColumnWid
                         Column.ofValues("Column4", 200, 300, 500));
 
         final CsvSpecs.Builder specsBase = defaultCsvBuilder().hasFixedWidthColumns(true).hasHeaderRow(false)
-                .delimiter(' ').ignoreSurroundingSpaces(true);
+                .ignoreSurroundingSpaces(true);
 
         if (specifyColumnWidths) {
             final CsvSpecs specs = specsBase.fixedColumnWidths(Arrays.asList(6, 9, 8, 3)).build();
@@ -2063,7 +2037,7 @@ public void columnNamesMayBeSpecified(boolean specifyColumnNames) throws CsvRead
                         Column.ofValues(expectedColumnNames[3], 200, 300, 500));
 
         CsvSpecs.Builder specsBuilder = defaultCsvBuilder().hasFixedWidthColumns(true).hasHeaderRow(false)
-                .delimiter(' ').ignoreSurroundingSpaces(true).fixedColumnWidths(Arrays.asList(6, 9, 8, 3));
+                .ignoreSurroundingSpaces(true).fixedColumnWidths(Arrays.asList(6, 9, 8, 3));
 
         if (specifyColumnNames) {
             specsBuilder = specsBuilder.headers(Arrays.asList(expectedColumnNames));
@@ -2092,7 +2066,7 @@ public void countsBMPCharactersTheSame(boolean useUtf32CountingConvention) throw
                         Column.ofValues("Price", 0.15, 0.18),
                         Column.ofValues("SecurityId", 300, 500));
 
-        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true)
                 .ignoreSurroundingSpaces(true).useUtf32CountingConvention(useUtf32CountingConvention).build();
 
         invokeTest(specs, input, expected);
@@ -2125,7 +2099,7 @@ public void countsNonBMPCharactersDifferently(boolean useUtf32CountingConvention
                     Column.ofRefs("Type", "💓💕💖Dividend", "Dividend"));
         }
 
-        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true)
                 .ignoreSurroundingSpaces(true).useUtf32CountingConvention(useUtf32CountingConvention).build();
 
         invokeTest(specs, input, expected);
@@ -2166,7 +2140,7 @@ public void unicodeColumnHeaders(boolean useUtf32CountingConvention) throws CsvR
                     Column.ofRefs("╔═╤═╗", "gh"));
         }
 
-        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true)
                 .ignoreSurroundingSpaces(true).useUtf32CountingConvention(useUtf32CountingConvention).build();
 
         invokeTest(specs, input, expected);
@@ -2206,14 +2180,12 @@ public void brokenSurrogatePair(boolean useUtf32CountingConvention) throws CsvRe
                     Column.ofRefs("C2", "😻 🧡💓"));
         }
 
-        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).delimiter(' ')
+        final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true)
                 .ignoreSurroundingSpaces(false).useUtf32CountingConvention(useUtf32CountingConvention).build();
 
         invokeTest(specs, input, expected);
     }
 
-
-
     private static final class RepeatingInputStream extends InputStream {
         private byte[] data;
         private final byte[] body;