From aa001a26a14ff67b3fedfcb389447eeb8e655089 Mon Sep 17 00:00:00 2001 From: Mark Pflug Date: Tue, 26 Sep 2023 07:25:37 -0700 Subject: [PATCH] Lazy sst (#135) Make shared string loading lazy and only happen when the string is accessed. --- docs/ReleaseNotes.md | 2 + source/Sylvan.Data.Excel.Tests/CustomTests.cs | 15 +- .../ExcelDataWriterTests.cs | 2 +- .../ExternalDataTests.cs | 41 +++++ .../ExcelDataReader+FieldInfo.cs | 2 + source/Sylvan.Data.Excel/ExcelDataReader.cs | 14 +- .../Sylvan.Data.Excel.csproj | 1 - .../Xls/XlsWorkbookReader.cs | 6 + .../Xlsb/XlsbWorkbookReader.cs | 68 ++++--- .../Xlsx/XlsxWorkbookReader.cs | 168 +++++++++++------- 10 files changed, 222 insertions(+), 97 deletions(-) diff --git a/docs/ReleaseNotes.md b/docs/ReleaseNotes.md index f7c7ad2..b84ddbd 100644 --- a/docs/ReleaseNotes.md +++ b/docs/ReleaseNotes.md @@ -3,6 +3,8 @@ _0.4.17_ - Exclude phonetic component when reading string values. - Allow invalid ref values, which Excel appears to treat as missing. +- SharedString tables are read lazily instead of eagerly for .xlsx and .xlsb files, allowing + faster access to early records in some cases. _0.4.16_ - Adds ExcelFileType class that exposes constants about supported Excel formats: extensions and content types. diff --git a/source/Sylvan.Data.Excel.Tests/CustomTests.cs b/source/Sylvan.Data.Excel.Tests/CustomTests.cs index 581dc24..eb8a2b9 100644 --- a/source/Sylvan.Data.Excel.Tests/CustomTests.cs +++ b/source/Sylvan.Data.Excel.Tests/CustomTests.cs @@ -143,14 +143,25 @@ public void EmptyInlineStr() [Fact] public void EmptyTrailingRow() { - // If the final (or trailing) row contains a shared string referencing - // an empty string, treat it as a null/empty value. + // In the case that there is a trailing row that references empty strings + // we will read it as a valid row. This should be uncommon enough that it won't affect anyone + // in practice. var reader = XlsxBuilder.Create(TestData.EmptySSTrailingRow, TestData.SharedStringEmpty); Assert.True(reader.Read()); Assert.Equal(3, reader.RowFieldCount); Assert.Equal("a", reader.GetString(0)); Assert.Equal("a", reader.GetString(1)); + + Assert.True(reader.Read()); + + for (int i = 0; i < reader.RowFieldCount; i++) + { + Assert.True(reader.IsDBNull(i)); + Assert.Equal("", reader.GetString(i)); + + } + Assert.False(reader.Read()); } diff --git a/source/Sylvan.Data.Excel.Tests/ExcelDataWriterTests.cs b/source/Sylvan.Data.Excel.Tests/ExcelDataWriterTests.cs index 71e24db..6e2e901 100644 --- a/source/Sylvan.Data.Excel.Tests/ExcelDataWriterTests.cs +++ b/source/Sylvan.Data.Excel.Tests/ExcelDataWriterTests.cs @@ -129,7 +129,7 @@ public void Violence() { w.Write(reader); } - Open(f); + //Open(f); Validate(f); } diff --git a/source/Sylvan.Data.Excel.Tests/ExternalDataTests.cs b/source/Sylvan.Data.Excel.Tests/ExternalDataTests.cs index 285eef9..0b3392e 100644 --- a/source/Sylvan.Data.Excel.Tests/ExternalDataTests.cs +++ b/source/Sylvan.Data.Excel.Tests/ExternalDataTests.cs @@ -41,6 +41,47 @@ public static IEnumerable GetInputs() } } + + [Fact] + public void AnalyzeFiles() + { + var root = Environment.GetEnvironmentVariable("SylvanExcelTestData"); + if (string.IsNullOrEmpty(root)) + return; + var files = Directory.EnumerateFiles(root, "*.xlsx"); + foreach (var file in files) + { + AnalyzeFile(file); + } + } + + void AnalyzeFile(string file) + { + try + { + //using var s = File.OpenRead(file); + //using var za = new ZipArchive(s, ZipArchiveMode.Read); + var edr = ExcelDataReader.Create(file); + while (edr.Read()) + { + for (int i = 0; i < edr.RowFieldCount; i++) + { + if (edr.GetExcelDataType(i) == ExcelDataType.String) + { + if (edr.GetString(i) == "") + { + o.WriteLine($"{Path.GetFileName(file)} {edr.RowNumber} {i}"); + } + } + } + } + } + catch (Exception e) + { + o.WriteLine($"{Path.GetFileName(file)} ERROR {e.Message}"); + } + } + [Fact] public void XmlCharRegex() { diff --git a/source/Sylvan.Data.Excel/ExcelDataReader+FieldInfo.cs b/source/Sylvan.Data.Excel/ExcelDataReader+FieldInfo.cs index 94655c8..ad89791 100644 --- a/source/Sylvan.Data.Excel/ExcelDataReader+FieldInfo.cs +++ b/source/Sylvan.Data.Excel/ExcelDataReader+FieldInfo.cs @@ -9,7 +9,9 @@ private protected struct FieldInfo public static readonly FieldInfo Null = default; public ExcelDataType type; + public bool isSS; public string? strValue; + public int ssIdx; public double numValue; public DateTime dtValue; public int xfIdx; diff --git a/source/Sylvan.Data.Excel/ExcelDataReader.cs b/source/Sylvan.Data.Excel/ExcelDataReader.cs index fdc17f9..6daae96 100644 --- a/source/Sylvan.Data.Excel/ExcelDataReader.cs +++ b/source/Sylvan.Data.Excel/ExcelDataReader.cs @@ -3,7 +3,6 @@ using System.Collections; using System.Collections.Generic; using System.Collections.ObjectModel; -using System.ComponentModel.Design; using System.Data; using System.Data.Common; using System.Globalization; @@ -857,9 +856,16 @@ public sealed override string GetString(int ordinal) case ExcelDataType.Numeric: return FormatVal(fi.xfIdx, fi.numValue); } - return fi.strValue ?? string.Empty; + return ProcString(in fi); } + string ProcString(in FieldInfo fi) + { + return (fi.isSS ? GetSharedString(fi.ssIdx) : fi.strValue) ?? string.Empty; + } + + private protected abstract string GetSharedString(int idx); + string FormatVal(int xfIdx, double val) { var fmtIdx = xfIdx >= this.xfMap.Length ? -1 : this.xfMap[xfIdx]; @@ -891,7 +897,7 @@ public sealed override double GetDouble(int ordinal) switch (cell.type) { case ExcelDataType.String: - return double.Parse(cell.strValue!, culture); + return double.Parse(ProcString(in cell), culture); case ExcelDataType.Numeric: return cell.numValue; case ExcelDataType.Error: @@ -924,7 +930,7 @@ public sealed override bool GetBoolean(int ordinal) var trueString = col?.TrueString ?? this.trueString; var falseString = col?.FalseString ?? this.falseString; - var strVal = fi.strValue; + var strVal = ProcString(in fi); var c = StringComparer.OrdinalIgnoreCase; if (trueString != null && c.Equals(strVal, trueString)) diff --git a/source/Sylvan.Data.Excel/Sylvan.Data.Excel.csproj b/source/Sylvan.Data.Excel/Sylvan.Data.Excel.csproj index 40b0a5b..15bd41b 100644 --- a/source/Sylvan.Data.Excel/Sylvan.Data.Excel.csproj +++ b/source/Sylvan.Data.Excel/Sylvan.Data.Excel.csproj @@ -4,7 +4,6 @@ net6.0;netstandard2.1;netstandard2.0 latest 0.4.17 - b0003 A cross-platform .NET library for reading Excel data files. excel;xls;xlsx;xlsb;datareader enable diff --git a/source/Sylvan.Data.Excel/Xls/XlsWorkbookReader.cs b/source/Sylvan.Data.Excel/Xls/XlsWorkbookReader.cs index ebe3a22..da39600 100644 --- a/source/Sylvan.Data.Excel/Xls/XlsWorkbookReader.cs +++ b/source/Sylvan.Data.Excel/Xls/XlsWorkbookReader.cs @@ -94,6 +94,12 @@ public override bool Read() return NextRow(); } + private protected override string GetSharedString(int idx) + { + // .xls eagerly loads the shared strings. + return sst[idx]; + } + public override int MaxFieldCount => 256; BOFType ReadBOF() diff --git a/source/Sylvan.Data.Excel/Xlsb/XlsbWorkbookReader.cs b/source/Sylvan.Data.Excel/Xlsb/XlsbWorkbookReader.cs index 52c5f37..de7311a 100644 --- a/source/Sylvan.Data.Excel/Xlsb/XlsbWorkbookReader.cs +++ b/source/Sylvan.Data.Excel/Xlsb/XlsbWorkbookReader.cs @@ -22,11 +22,17 @@ sealed class XlsbWorkbookReader : ExcelDataReader int parsedRowIndex = -1; int curFieldCount = -1; + readonly ZipArchiveEntry? sstPart; + Stream? sstStream; + RecordReader? sstReader; + int sstIdx = -1; + public override ExcelWorkbookType WorkbookType => ExcelWorkbookType.ExcelXml; public override void Close() { this.sheetStream?.Close(); + this.sstStream?.Close(); base.Close(); } @@ -53,7 +59,7 @@ public XlsbWorkbookReader(Stream stream, ExcelDataReaderOptions opts) : base(str var stylePart = package.GetEntry(stylesPartName); - sst = ReadSharedStrings(sharedStringsPartName); + this.sstPart = package.GetEntry(sharedStringsPartName); var sheetNameList = new List(); using (Stream sheetsStream = workbookPart.Open()) @@ -213,17 +219,17 @@ bool InitializeSheet() return true; } - string[] ReadSharedStrings(string sharedStringsPartName) + bool LoadSst(int idx) { - var ssPart = package.GetEntry(sharedStringsPartName); - if (ssPart == null) + var reader = this.sstReader; + if (sstPart == null) { - return Array.Empty(); + return false; } - using (var stream = ssPart.Open()) + if (reader == null) { - var reader = new RecordReader(stream); - + this.sstStream = sstPart.Open(); + reader = this.sstReader = new RecordReader(this.sstStream); reader.NextRecord(); if (reader.RecordType != RecordType.SSTBegin) throw new InvalidDataException(); @@ -231,23 +237,39 @@ string[] ReadSharedStrings(string sharedStringsPartName) int totalCount = reader.GetInt32(0); int count = reader.GetInt32(4); - var ss = new string[count]; + if (count > 128) + count = 128; + this.sst = new string[count]; + } + while (idx > this.sstIdx) + { + if (!reader.NextRecord() || reader.RecordType != RecordType.SSTItem) + { + throw new InvalidDataException(); + } - for (int i = 0; i < count; i++) + var flags = reader.GetByte(0); + var str = reader.GetString(1); + this.sstIdx++; + if (sstIdx >= this.sst.Length) { - reader.NextRecord(); - if (reader.RecordType != RecordType.SSTItem) - { - reader.DebugInfo("fail"); - throw new InvalidDataException(); - } + Array.Resize(ref sst, sst.Length * 2); + } + sst[sstIdx] = str; + } + return true; + } - var flags = reader.GetByte(0); - var str = reader.GetString(1); - ss[i] = str; + private protected override string GetSharedString(int idx) + { + if (this.sstIdx < idx) + { + if (!LoadSst(idx)) + { + throw new InvalidDataException(); } - return ss; } + return sst[idx]; } public override bool Read() @@ -427,7 +449,10 @@ static void EnsureCols(ref FieldInfo[] values, int c) case RecordType.CellIsst: type = ExcelDataType.String; var sstIdx = reader.GetInt32(8); - fi.strValue = sst[sstIdx]; + + fi.isSS = true; + fi.ssIdx = sstIdx; + //fi.strValue = sst[sstIdx]; notNull++; break; case RecordType.CellSt: @@ -443,7 +468,6 @@ static void EnsureCols(ref FieldInfo[] values, int c) break; } - fi.type = type; fi.xfIdx = sf; count = col + 1; diff --git a/source/Sylvan.Data.Excel/Xlsx/XlsxWorkbookReader.cs b/source/Sylvan.Data.Excel/Xlsx/XlsxWorkbookReader.cs index 4fc5276..55cacc5 100644 --- a/source/Sylvan.Data.Excel/Xlsx/XlsxWorkbookReader.cs +++ b/source/Sylvan.Data.Excel/Xlsx/XlsxWorkbookReader.cs @@ -40,6 +40,16 @@ sealed class XlsxWorkbookReader : ExcelDataReader public override ExcelWorkbookType WorkbookType => ExcelWorkbookType.ExcelXml; const string DefaultWorkbookPartName = "xl/workbook.xml"; + readonly ZipArchiveEntry? sstPart; + XmlReader? sstReader; + int sstIdx = -1; + + public override void Close() + { + this.reader?.Close(); + this.sstReader?.Close(); + base.Close(); + } public XlsxWorkbookReader(Stream iStream, ExcelDataReaderOptions opts) : base(iStream, opts) { @@ -61,11 +71,9 @@ public XlsxWorkbookReader(Stream iStream, ExcelDataReaderOptions opts) : base(iS var sheetRelMap = OpenPackaging.LoadWorkbookRelations(package, workbookPartName, ref stylesPartName, ref sharedStringsPartName); - var ssPart = package.FindEntry(sharedStringsPartName); + sstPart = package.FindEntry(sharedStringsPartName); var stylePart = package.FindEntry(stylesPartName); - LoadSharedStrings(ssPart); - using (Stream sheetsStream = workbookPart.Open()) { // quick and dirty, good enough, this doc should be small. @@ -193,6 +201,7 @@ private protected override bool OpenWorksheet(int sheetIdx) var settings = new XmlReaderSettings { CheckCharacters = false, + CloseInput = true, ValidationType = ValidationType.None, ValidationFlags = System.Xml.Schema.XmlSchemaValidationFlags.None, #if SPAN @@ -456,11 +465,11 @@ int ParseRowValues() if (n == "r") { len = reader.ReadValueChunk(valueBuffer, 0, valueBuffer.Length); - + if (CellPosition.TryParse(valueBuffer.AsSpan().ToParsable(0, len), out var pos)) { col = pos.Column; - } + } else { // if the cell ref is unparsable, Excel seems to treat it as missing. @@ -600,13 +609,15 @@ static CellType GetCellType(char[] b, int l) { throw new FormatException(); } - fi.strValue = GetSharedString(strIdx); + fi.isSS = true; + fi.ssIdx = strIdx; + //fi.strValue = GetSharedString(strIdx); } else { fi.strValue = string.Empty; } - fi.type = fi.strValue.Length == 0 ? ExcelDataType.Null : ExcelDataType.String; + fi.type = ExcelDataType.String; break; case CellType.String: if (reader.NodeType == XmlNodeType.Text) @@ -786,61 +797,6 @@ internal override DateTime GetDateTimeValue(int ordinal) public override int RowNumber => rowIndex + 1; - void LoadSharedStrings(ZipArchiveEntry? entry) - { - if (entry == null) - { - return; - } - using Stream ssStream = entry.Open(); - - var settings = new XmlReaderSettings - { - CheckCharacters = false, -#if SPAN - // name table optimization requires ROS - NameTable = new SharedStringsNameTable(), -#endif - }; - - using var reader = XmlReader.Create(ssStream, settings); - - while (reader.Read()) - { - if (reader.NodeType == XmlNodeType.Element && reader.LocalName == "sst") - { - break; - } - } - - var countStr = reader.GetAttribute("uniqueCount"); - - var count = 0; - if (!string.IsNullOrEmpty(countStr) && int.TryParse(countStr, out count) && count >= 0) - { - - } - else - { - // try to estimate the number of strings based on the entry size - // Estimate ~24 bytes per string record. - var estimatedCount = (int)(entry.Length / 24); - count = Math.Max(1, estimatedCount); - } - - var sstList = new List(count); - - while (reader.Read()) - { - if (reader.NodeType == XmlNodeType.Element && reader.LocalName == "si") - { - var str = ReadString(reader); - sstList.Add(str); - } - } - - this.sst = sstList.ToArray(); - } string ReadString(XmlReader reader) { @@ -858,7 +814,7 @@ string ReadString(XmlReader reader) int c = 0; while (reader.Read() && reader.Depth > depth) { - start: + start: if (reader.NodeType == XmlNodeType.Element && reader.LocalName == "rPh") { SkipSubtree(reader); @@ -922,11 +878,89 @@ string ReadString(XmlReader reader) return str; } - string GetSharedString(int i) + private protected override string GetSharedString(int idx) { - if ((uint)i >= sst.Length) - throw new ArgumentOutOfRangeException(nameof(i)); + if (this.sstIdx < idx) + { + if (!LoadSharedString(idx)) + { + throw new InvalidDataException(); + } + } + return sst[idx]; + } + + bool LoadSharedString(int i) + { + var reader = this.sstReader; + if (reader == null) + { + var sstStream = sstPart!.Open(); + var settings = new XmlReaderSettings + { + CloseInput = true, + CheckCharacters = false, +#if SPAN + // name table optimization requires ROS + NameTable = new SharedStringsNameTable(), +#endif + }; + + reader = this.sstReader = XmlReader.Create(sstStream, settings); + // advance to the content + while (reader.Read()) + { + if (reader.NodeType == XmlNodeType.Element && reader.LocalName == "sst") + { + break; + } + } + + var countStr = reader.GetAttribute("uniqueCount"); + + var count = 0; + if (!string.IsNullOrEmpty(countStr) && int.TryParse(countStr, out count) && count >= 0) + { - return sst[i]; + } + else + { + // try to estimate the number of strings based on the entry size + // Estimate ~24 bytes per string record. + var estimatedCount = (int)(sstPart.Length / 24); + count = Math.Max(1, estimatedCount); + } + if (count > 128) + count = 128; + this.sst = new string[count]; + } + + while (i > sstIdx) + { + if (reader.Read()) + { + if (reader.NodeType == XmlNodeType.Element && reader.LocalName == "si") + { + var str = ReadString(reader); + sstIdx++; + if (sstIdx >= sst.Length) + { + Array.Resize(ref sst, sst.Length * 2); + } + sst[sstIdx] = str; + + } + } + else + { + // a cell with an SST value reference out of bounds. + // this exception type is probably wrong + //y + //throw new ArgumentOutOfRangeException(nameof(i)); + return false; + } + } + + return true; } }