From e458b57c8650886db866e4e37a1ef3e4aa712c35 Mon Sep 17 00:00:00 2001 From: Mark Pflug Date: Wed, 14 Aug 2024 16:57:49 -0700 Subject: [PATCH] Xls label fix (#183) * Fix label (pre-biff8) reading * Add support for reading "Book" entry for xl95 support. * Fix reading label (String) values in biff5 (xl95) * Fix reading rich strings in xl95. --- docs/ReleaseNotes.md | 3 + .../ExternalDataTests.cs | 28 ++++- source/Sylvan.Data.Excel/ExcelDataReader.cs | 1 + .../Sylvan.Data.Excel.csproj | 2 +- .../Xls/XlsWorkbookReader+RecordReader.cs | 115 ++++++++---------- .../Xls/XlsWorkbookReader.cs | 60 +++++++-- 6 files changed, 127 insertions(+), 82 deletions(-) diff --git a/docs/ReleaseNotes.md b/docs/ReleaseNotes.md index d6a32d8..fb4b295 100644 --- a/docs/ReleaseNotes.md +++ b/docs/ReleaseNotes.md @@ -1,5 +1,8 @@ # Sylvan.Data.Excel Release Notes +_0.4.25_ +- Fix some issues with reading Excel 95 .xls files. + _0.4.24_ - Fix for reading certain .xls files. - Handle writing NaN and infinity values. diff --git a/source/Sylvan.Data.Excel.Tests/ExternalDataTests.cs b/source/Sylvan.Data.Excel.Tests/ExternalDataTests.cs index cf8aefd..779ce28 100644 --- a/source/Sylvan.Data.Excel.Tests/ExternalDataTests.cs +++ b/source/Sylvan.Data.Excel.Tests/ExternalDataTests.cs @@ -1,5 +1,6 @@ #if NETCOREAPP3_0_OR_GREATER +using Sylvan.Data.Csv; using System; using System.Collections.Generic; using System.Diagnostics; @@ -231,7 +232,6 @@ public void GetValue(string path) GetErrorAsNull = true }; var edr = ExcelDataReader.Create(path, opts); - do { while (edr.Read()) @@ -243,6 +243,32 @@ public void GetValue(string path) } } while (edr.NextResult()); } + + [Theory] + [MemberData(nameof(GetExcelFiles))] + public void ToCsv(string filename) + { + if (filename == null) return; + + var root = GetRootPath(); + var path = Path.Combine(root, filename); + + var opts = new ExcelDataReaderOptions + { + Schema = ExcelSchema.NoHeaders, + GetErrorAsNull = true + }; + var edr = ExcelDataReader.Create(path, opts); + + do + { + var outPath = $"{filename}-{edr.WorksheetName}.csv"; + var dir = Path.GetDirectoryName(outPath); + Directory.CreateDirectory(dir); + using var w = CsvDataWriter.Create($"{filename}-{edr.WorksheetName}.csv"); + w.Write(edr.AsVariableField(e => e.RowFieldCount)); + } while (edr.NextResult()); + } } #endif \ No newline at end of file diff --git a/source/Sylvan.Data.Excel/ExcelDataReader.cs b/source/Sylvan.Data.Excel/ExcelDataReader.cs index 907e908..1d6cddb 100644 --- a/source/Sylvan.Data.Excel/ExcelDataReader.cs +++ b/source/Sylvan.Data.Excel/ExcelDataReader.cs @@ -633,6 +633,7 @@ public sealed override object GetValue(int ordinal) var kind = fmt?.Kind ?? FormatKind.Number; switch (kind) { + case FormatKind.String: case FormatKind.Number: var doubleValue = GetDouble(ordinal); unchecked diff --git a/source/Sylvan.Data.Excel/Sylvan.Data.Excel.csproj b/source/Sylvan.Data.Excel/Sylvan.Data.Excel.csproj index 44059fd..4b9e20b 100644 --- a/source/Sylvan.Data.Excel/Sylvan.Data.Excel.csproj +++ b/source/Sylvan.Data.Excel/Sylvan.Data.Excel.csproj @@ -3,7 +3,7 @@ net6.0;netstandard2.1;netstandard2.0 latest - 0.4.24 + 0.4.25 A cross-platform .NET library for reading Excel data files. excel;xls;xlsx;xlsb;datareader enable diff --git a/source/Sylvan.Data.Excel/Xls/XlsWorkbookReader+RecordReader.cs b/source/Sylvan.Data.Excel/Xls/XlsWorkbookReader+RecordReader.cs index 5999137..4301fb3 100644 --- a/source/Sylvan.Data.Excel/Xls/XlsWorkbookReader+RecordReader.cs +++ b/source/Sylvan.Data.Excel/Xls/XlsWorkbookReader+RecordReader.cs @@ -115,57 +115,7 @@ public int ReadInt32() { return ReadByte() | ReadByte() << 8 | ReadByte() << 16 | ReadByte() << 24; } - - public string ReadString16() - { - if (bufferPos >= recordOff + recordLen) - { - var next = NextRecord(); - if (!next || Type != RecordType.Continue) - throw new InvalidDataException(); - } - - // the length of the string in *characters* - int len = ReadInt16(); - if (len < 0) - { - throw new InvalidDataException(); - } - byte options = ReadByte(); - - bool compressed = (options & 0x01) == 0; - bool asian = (options & 0x04) != 0; - bool rich = (options & 0x08) != 0; - - int richCount = 0; - if (rich) - richCount = ReadInt16(); - - int asianCount = 0; - if (asian) - asianCount = ReadInt32(); - - var str = ReadStringBuffer(len, compressed); - - var remain = richCount * 4 + asianCount; - - while (remain > 0) - { - var avail = recordOff + recordLen - bufferPos; - var c = Math.Min(remain, avail); - remain -= c; - bufferPos += c; - Assert(); - if (remain > 0) - { - var next = NextRecord(); - if (!next || Type != RecordType.Continue) - throw new InvalidDataException(); - } - } - - return str; - } + static readonly Encoding Encoding1252 = Encoding.GetEncoding(1252); @@ -238,20 +188,44 @@ internal string ReadStringBuffer(int charCount, bool compressed) public string ReadByteString(int lenSize) { - int len; - if (lenSize == 1) - len = ReadByte(); - else - len = ReadInt16(); - - ReadStringBuffer(len, true); - var str = new string(strBuffer, 0, len); - return str; + int len = + lenSize == 1 + ? ReadByte() + : ReadInt16(); + + return ReadStringBuffer(len, true); } public string ReadString8() { - int len = ReadByte(); + MaybeContinueString(); + var len = ReadByte(); + return ReadString(len); + } + + public string ReadString16() + { + MaybeContinueString(); + var len = ReadInt16(); + return ReadString(len); + } + + void MaybeContinueString() + { + if (bufferPos >= recordOff + recordLen) + { + var next = NextRecord(); + if (!next || Type != RecordType.Continue) + throw new InvalidDataException(); + } + } + + public string ReadString(int len) + { + if (len < 0) + { + throw new InvalidDataException(); + } byte options = ReadByte(); bool compressed = (options & 0x01) == 0; @@ -268,14 +242,21 @@ public string ReadString8() var str = ReadStringBuffer(len, compressed); - for (int i = 0; i < richCount; i++) - { - ReadInt32(); - } + var remain = richCount * 4 + asianCount; - for (int i = 0; i < asianCount; i++) + while (remain > 0) { - ReadByte(); + var avail = recordOff + recordLen - bufferPos; + var c = Math.Min(remain, avail); + remain -= c; + bufferPos += c; + Assert(); + if (remain > 0) + { + var next = NextRecord(); + if (!next || Type != RecordType.Continue) + throw new InvalidDataException(); + } } return str; diff --git a/source/Sylvan.Data.Excel/Xls/XlsWorkbookReader.cs b/source/Sylvan.Data.Excel/Xls/XlsWorkbookReader.cs index 942316d..270c366 100644 --- a/source/Sylvan.Data.Excel/Xls/XlsWorkbookReader.cs +++ b/source/Sylvan.Data.Excel/Xls/XlsWorkbookReader.cs @@ -32,7 +32,10 @@ public XlsSheetInfo(string name, int offset, bool hidden) : base(name, hidden) internal XlsWorkbookReader(Stream stream, ExcelDataReaderOptions options) : base(stream, options) { var pkg = new Ole2Package(stream); - var part = pkg.GetEntry("Workbook\0"); + var part = + pkg.GetEntry("Workbook\0") ?? + pkg.GetEntry("Book\0"); + if (part == null) throw new InvalidDataException(); var ps = part.Open(); @@ -254,15 +257,10 @@ int ParseXF() void ParseFormat() { int ifmt = reader.ReadInt16(); - string str; - if (biffVersion == 0x0500) - { - str = reader.ReadByteString(1); - } - else - { - str = reader.ReadString16(); - } + string str = + biffVersion == 0x0500 + ? reader.ReadByteString(1) + : reader.ReadString16(); if (formats.ContainsKey(ifmt)) { @@ -310,7 +308,38 @@ void ParseLabel() int rowIdx = reader.ReadUInt16(); int colIdx = reader.ReadUInt16(); int xfIdx = reader.ReadUInt16(); - string str = reader.ReadByteString(2); + int len = reader.ReadInt16(); + if (len > 255) throw new InvalidDataException(); + bool compressed = true; + if (biffVersion == 0x0500) + { + // apparently there are no flags in this version + } + else + { + byte flags = reader.ReadByte(); + compressed = (flags & 1) == 0; + } + + var str = reader.ReadStringBuffer(len, compressed); + SetRowData(colIdx, new FieldInfo(str)); + } + + void ParseRString() + { + int rowIdx = reader.ReadUInt16(); + int colIdx = reader.ReadUInt16(); + int xfIdx = reader.ReadUInt16(); + var len = reader.ReadInt16(); + var str = reader.ReadStringBuffer(len, true); + + // consume the formatting info + var x = reader.ReadByte(); + for (int i = 0; i < x; i++) + { + reader.ReadUInt16(); + } + SetRowData(colIdx, new FieldInfo(str)); } @@ -470,7 +499,10 @@ int NextRow() } else { - throw new InvalidDataException(); + peekRow = (ushort)(rowIndex + 1); + pendingRow = peekRow; + return 0; + //throw new InvalidDataException(); } } break; @@ -516,10 +548,12 @@ int NextRow() case RecordType.Formula: ParseFormula(); break; + case RecordType.RString: + ParseRString(); + break; case RecordType.Blank: case RecordType.BoolErr: case RecordType.MulBlank: - case RecordType.RString: break; case RecordType.Array: case RecordType.SharedFmla: