Add fixed-width column support

deephaven · Nov 5, 2024 · 6aee4ae · 6aee4ae
1 parent 1c52d59
commit 6aee4ae
Show file tree

Hide file tree

Showing 5 changed files with 695 additions and 7 deletions.
diff --git a/src/main/java/io/deephaven/csv/CsvSpecs.java b/src/main/java/io/deephaven/csv/CsvSpecs.java
@@ -117,6 +117,34 @@ public interface Builder {
          */
         Builder headerValidator(Predicate<String> headerValidator);
 
+        /**
+         * True if the input is organized into fixed width columns rather than delimited by a delimiter. Defaults to
+         * false.
+         *
+         * @param hasFixedWidthColumns Whether the input should be treated as fixed width columns.
+         */
+        Builder hasFixedWidthColumns(boolean hasFixedWidthColumns);
+
+        /**
+         * When {@link #hasFixedWidthColumns} is set, the library either determines the column widths from the header
+         * row (provided {@link #hasHeaderRow} is set), or the column widths can be specified explicitly by the caller.
+         * If the caller wants to specify them explicitly, they can use this method. The widths are measured in the
+         * units selected by {@link #useUtf32CountingConvention}. Defaults to the empty list.
+         *
+         * @param fixedColumnWidths The caller-specified widths of the columns.
+         */
+        Builder fixedColumnWidths(Iterable<Integer> fixedColumnWidths);
+
+        /**
+         * This setting controls what units fixed width columns are measured in. When true, fixed width columns are
+         * measured in Unicode code points. When false, fixed width columns are measured in UTF-16 units (aka Java
+         * chars). The difference arises when encountering characters outside the Unicode Basic Multilingual Plane. For
+         * example, the Unicode code point 💔 (U+1F494) is one Unicode code point, but takes two Java chars to
+         * represent. Along these lines, the string 💔💔💔 would fit in a column of width 3 when this setting is true,
+         * but would require a column width of at least 6 when this setting is false.
+         *
+         * <p>
+         * The default setting of true is arguably more natural for users (the number of characters they see matches the
+         * visual width of the column). But some programs may want the value of false because they are counting Java
+         * chars.
+         *
+         * @param useUtf32CountingConvention True to count widths in Unicode code points; false to count them in UTF-16
+         *        units.
+         */
+        Builder useUtf32CountingConvention(boolean useUtf32CountingConvention);
+
         /**
          * Number of data rows to skip before processing data. This is useful when you want to parse data in chunks.
          * Typically used together with {@link Builder#numRows}. Defaults to 0.
@@ -340,6 +368,30 @@ public Predicate<String> headerValidator() {
         return c -> true;
     }
 
+    /**
+     * See {@link Builder#hasFixedWidthColumns}. Defaults to false.
+     */
+    @Default
+    public boolean hasFixedWidthColumns() {
+        return false;
+    }
+
+    /**
+     * See {@link Builder#fixedColumnWidths}. Defaults to the empty list.
+     */
+    @Default
+    public List<Integer> fixedColumnWidths() {
+        return Collections.emptyList();
+    }
+
+    /**
+     * See {@link Builder#useUtf32CountingConvention}. Defaults to true.
+     */
+    @Default
+    public boolean useUtf32CountingConvention() {
+        return true;
+    }
+
     /**
      * See {@link Builder#skipRows}.
      */

diff --git a/src/main/java/io/deephaven/csv/reading/CsvReader.java b/src/main/java/io/deephaven/csv/reading/CsvReader.java
@@ -7,7 +7,9 @@
 import io.deephaven.csv.parsers.Parser;
 import io.deephaven.csv.reading.cells.CellGrabber;
 import io.deephaven.csv.reading.cells.DelimitedCellGrabber;
+import io.deephaven.csv.reading.cells.FixedCellGrabber;
 import io.deephaven.csv.reading.headers.DelimitedHeaderFinder;
+import io.deephaven.csv.reading.headers.FixedHeaderFinder;
 import io.deephaven.csv.sinks.Sink;
 import io.deephaven.csv.sinks.SinkFactory;
 import io.deephaven.csv.util.*;
@@ -63,7 +65,8 @@ private CsvReader() {}
      */
     public static Result read(final CsvSpecs specs, final InputStream stream, final SinkFactory sinkFactory)
             throws CsvReaderException {
-        return delimitedReadLogic(specs, stream, sinkFactory);
+        return specs.hasFixedWidthColumns() ? fixedReadLogic(specs, stream, sinkFactory)
+                : delimitedReadLogic(specs, stream, sinkFactory);
     }
 
     private static Result delimitedReadLogic(
@@ -97,6 +100,16 @@ private static Result delimitedReadLogic(
         return commonReadLogic(specs, grabber, firstDataRow, numInputCols, numOutputCols, headersToUse, sinkFactory);
     }
 
+    /**
+     * Reads input that is organized into fixed width columns. The public read method dispatches here when
+     * {@link CsvSpecs#hasFixedWidthColumns} is true.
+     *
+     * <p>
+     * The stream is first wrapped in a "line grabber", which is handed to FixedHeaderFinder to determine the headers.
+     * The column widths are read back out of the {@code columnWidths} holder right after that call (presumably
+     * FixedHeaderFinder fills it in; confirm there). A FixedCellGrabber is then layered over the same line grabber to
+     * split each line into cells of those widths. Finally everything is passed to commonReadLogic, with null for the
+     * optional first data row and with the number of headers used as both the input and the output column count.
+     *
+     * @param specs The specs controlling the read.
+     * @param stream The input stream to read from.
+     * @param sinkFactory Passed through unchanged to commonReadLogic.
+     * @return Whatever commonReadLogic returns.
+     * @throws CsvReaderException Declared because the header finder and commonReadLogic calls may throw it.
+     */
+    private static Result fixedReadLogic(
+            final CsvSpecs specs, final InputStream stream, final SinkFactory sinkFactory) throws CsvReaderException {
+        final CellGrabber lineGrabber = FixedCellGrabber.makeLineGrabber(stream);
+        MutableObject<int[]> columnWidths = new MutableObject<>();
+        final String[] headers = FixedHeaderFinder.determineHeadersToUse(specs, lineGrabber, columnWidths);
+        final int numCols = headers.length;
+        final CellGrabber grabber = new FixedCellGrabber(lineGrabber, columnWidths.getValue(),
+                specs.ignoreSurroundingSpaces(), specs.useUtf32CountingConvention());
+        return commonReadLogic(specs, grabber, null, numCols, numCols, headers, sinkFactory);
+    }
 
     private static Result commonReadLogic(final CsvSpecs specs, CellGrabber grabber, byte[][] optionalFirstDataRow,
             int numInputCols, int numOutputCols,

diff --git a/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java b/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java
@@ -0,0 +1,113 @@
+package io.deephaven.csv.reading.cells;
+
+import io.deephaven.csv.containers.ByteSlice;
+import io.deephaven.csv.reading.ReaderUtil;
+import io.deephaven.csv.util.CsvReaderException;
+import io.deephaven.csv.util.MutableBoolean;
+import io.deephaven.csv.util.MutableInt;
+
+import java.io.InputStream;
+
+/**
+ * This class uses an underlying DelimitedCellGrabber to grab whole lines at a time from the input stream, and then it
+ * breaks them into fixed-sized cells to return to the caller.
+ *
+ * <p>
+ * Each call to {@link #grabNext} returns the next cell of the current line. Every column except the final one takes
+ * up to {@code columnWidths[i]} characters; the final column takes whatever remains on the line. A row also ends early
+ * when a cell comes back empty (for example, because the line was shorter than the declared widths). Widths are
+ * counted in the units selected by {@code utf32CountingMode}; a character that needs more units than the cell has
+ * left is not split, but is left in place for the following cell.
+ *
+ * <p>
+ * The slices produced when splitting a line refer to the byte array of that line rather than to a copy.
+ * {@link #physicalRowNum} is delegated to the underlying line grabber.
+ */
+public class FixedCellGrabber implements CellGrabber {
+    /**
+     * Makes a degenerate CellGrabber that has no delimiters or quotes and therefore returns whole lines. This is a
+     * somewhat quick-and-dirty way to reuse the buffering and newline logic in DelimitedCellGrabber without rewriting
+     * it. The byte 0xff is passed for two of the constructor arguments because it can never occur in valid UTF-8
+     * text, so it should never match anything in well-formed input.
+     *
+     * <p>
+     * NOTE(review): the meaning of the trailing {@code true, false} arguments is not visible from this file. If the
+     * first of them enables skipping of leading whitespace, then leading spaces on a line would be dropped before the
+     * line is split into columns, which would shift every column. Confirm against the DelimitedCellGrabber
+     * constructor.
+     *
+     * @param stream The underlying stream.
+     * @return The "line grabber"
+     */
+    public static CellGrabber makeLineGrabber(InputStream stream) {
+        final byte IllegalUtf8 = (byte) 0xff;
+        return new DelimitedCellGrabber(stream, IllegalUtf8, IllegalUtf8, true, false);
+    }
+
+    private final CellGrabber lineGrabber;
+    private final int[] columnWidths;
+    private final boolean ignoreSurroundingSpaces;
+    private final boolean utf32CountingMode;
+    private final ByteSlice rowText;
+    private boolean needsUnderlyingRefresh;
+    private int colIndex;
+    private final MutableBoolean dummy1;
+    private final MutableInt dummy2;
+
+    /**
+     * Constructor. The grabber starts out needing a fresh line, so the first call to {@link #grabNext} reads from the
+     * underlying line grabber.
+     *
+     * @param lineGrabber The CellGrabber that supplies whole lines, e.g. one made by {@link #makeLineGrabber}.
+     * @param columnWidths The width of each column. The array is stored as-is, not copied. The width of the final
+     *        column is never consulted, because the final column takes the remainder of the line.
+     * @param ignoreSurroundingSpaces If true, each cell is passed through ReaderUtil.trimSpacesAndTabs before it is
+     *        returned.
+     * @param utf32CountingMode Selects the units in which widths are counted; it is forwarded to
+     *        ReaderUtil.getUtf8LengthAndCharLength.
+     */
+    public FixedCellGrabber(final CellGrabber lineGrabber, final int[] columnWidths, boolean ignoreSurroundingSpaces,
+            boolean utf32CountingMode) {
+        this.lineGrabber = lineGrabber;
+        this.columnWidths = columnWidths;
+        this.ignoreSurroundingSpaces = ignoreSurroundingSpaces;
+        this.utf32CountingMode = utf32CountingMode;
+        this.rowText = new ByteSlice();
+        this.needsUnderlyingRefresh = true;
+        this.colIndex = 0;
+        this.dummy1 = new MutableBoolean();
+        this.dummy2 = new MutableInt();
+    }
+
+    @Override
+    public void grabNext(ByteSlice dest, MutableBoolean lastInRow, MutableBoolean endOfInput)
+            throws CsvReaderException {
+        if (needsUnderlyingRefresh) {
+            // Previous row is finished (or this is the first call): fetch the next line. dummy1 soaks up an unused flag.
+            lineGrabber.grabNext(rowText, dummy1, endOfInput);
+
+            if (endOfInput.booleanValue()) {
+                // Set dest to the empty string, and leave 'endOfInput' set to true. 'lastInRow' is not written here.
+                dest.reset(rowText.data(), rowText.end(), rowText.end());
+                return;
+            }
+
+            needsUnderlyingRefresh = false;
+            colIndex = 0;
+        }
+
+        // There is data to return. Count off N characters. The final column gets all remaining characters.
+        final boolean lastCol = colIndex == columnWidths.length - 1;
+        final int numCharsToTake = lastCol ? Integer.MAX_VALUE : columnWidths[colIndex];
+        takeNCharactersInCharset(rowText, dest, numCharsToTake, utf32CountingMode, dummy2);
+        ++colIndex;
+        needsUnderlyingRefresh = lastCol || dest.size() == 0;
+        lastInRow.setValue(needsUnderlyingRefresh);
+        endOfInput.setValue(false);
+
+        if (ignoreSurroundingSpaces) {
+            ReaderUtil.trimSpacesAndTabs(dest);
+        }
+    }
+
+    private static void takeNCharactersInCharset(ByteSlice src, ByteSlice dest, int numCharsToTake,
+            boolean utf32CountingMode, MutableInt tempInt) {
+        final byte[] data = src.data();
+        final int cellBegin = src.begin();
+        int current = cellBegin;
+        while (numCharsToTake > 0) {
+            if (current == src.end()) {
+                break;
+            }
+            final int utf8Length = ReaderUtil.getUtf8LengthAndCharLength(data[current], src.end() - current,
+                    utf32CountingMode, tempInt);
+            if (numCharsToTake < tempInt.intValue()) {
+                // There is not enough space left in the field to store this character, so stop before it.
+                // This can happen if CsvSpecs is set for the UTF16 counting convention,
+                // there is one unit left in the field, and we encounter a character outside
+                // the Basic Multilingual Plane, which would require two units.
+                break;
+            }
+            numCharsToTake -= tempInt.intValue();
+            current += utf8Length;
+        }
+        dest.reset(src.data(), cellBegin, current);
+        src.reset(src.data(), current, src.end());
+    }
+
+    @Override
+    public int physicalRowNum() {
+        return lineGrabber.physicalRowNum();
+    }
+}