Merge pull request #581 from JuliaIO/dev
* fix attribute loading

* Update ci.yml

* WIP: Mmappable Arrays (#582)

* wip: mmappable arrays

* tests

* include mmap_test

* disable broken mmap on windows

* update warning test

* downgrade testing (#547)

* downgrade testing

* bump compat for Requires

* bump compat for FileIO

* rm UUIDs

* skip Mmap

* remove test that only tests a FileIO feature

---------

Co-authored-by: Jonas Isensee <[email protected]>

* experimental disable commit (#544)

* experimental disable commit

* 1.6 compat and make disable_commit a field of the file

* some code cleanup

* get rid of StorageMessage structs

* chunked storage fix

* clean up filterpipeline

* Draft: experimental plain reconstruction (#522)

* experimental plain reconstruction

* Upgrade

* add test

---------

Co-authored-by: Jonas Isensee <[email protected]>
JonasIsensee and Jonas Isensee authored Aug 26, 2024
2 parents 4387b69 + 5ec40d2 commit 99782ad
Showing 24 changed files with 544 additions and 288 deletions.
28 changes: 28 additions & 0 deletions .github/workflows/Downgrade.yml
@@ -0,0 +1,28 @@
name: Downgrade
on:
pull_request:
branches:
- master
paths-ignore:
- 'docs/**'
push:
branches:
- master
paths-ignore:
- 'docs/**'
jobs:
test:
runs-on: ubuntu-latest
strategy:
matrix:
version: ['1']
steps:
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v1
with:
version: ${{ matrix.version }}
- uses: cjdoris/julia-downgrade-compat-action@v1
with:
skip: Pkg,TOML,Mmap
- uses: julia-actions/julia-buildpkg@v1
- uses: julia-actions/julia-runtest@v1
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
@@ -3,9 +3,11 @@ on:
pull_request:
branches:
- master
- dev
push:
branches:
- master
- dev
tags: '*'
jobs:
test:
8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,11 @@
## 0.4.52
- fix attribute loading
- new features: `readmmap`, `ismmappable`, and `allocate_early` (experimental API)
- adds Downgrade testing
- new feature: disable committing datatypes (restricts stored values to HDF5-native numbers, strings, and arrays)
- internal cleanup
- new experimental feature: reconstruct all committed types as `NamedTuple`s

## 0.4.51
- remove Unicode normalization support due to excessive performance loss
- rework of header message internals
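The memory-mapping feature listed in this changelog entry does not appear in the rendered diff below (its `src` changes were not loaded), so the following is only a rough usage sketch; the exact call shapes — in particular going through `JLD2.get_dataset` for a dataset handle — are assumptions based on the changelog wording and may differ from the real experimental API.

```julia
using JLD2

# Write a large array with an isbits element type; only such contiguously
# stored datasets are plausible candidates for memory-mapping.
jldopen("big.jld2", "w") do f
    f["A"] = zeros(Float64, 10_000)
end

# Read it back without copying, if the dataset qualifies.
jldopen("big.jld2", "r") do f
    dset = JLD2.get_dataset(f, "A")   # assumed accessor for a dataset handle
    if JLD2.ismmappable(dset)         # experimental, per the changelog
        A = JLD2.readmmap(dset)       # an Array backed by Mmap rather than a copy
        @show sum(A)
    end
end
```

Note that the commit message above disables the mmap tests on Windows, so the feature should be treated as unsupported there for now.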
12 changes: 3 additions & 9 deletions Project.toml
@@ -1,27 +1,21 @@
name = "JLD2"
uuid = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
version = "0.4.51"
version = "0.4.52"

[deps]
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

[compat]
FileIO = "1"
FileIO = "1.5"
MacroTools = "0.5"
Mmap = "1"
OrderedCollections = "1"
PrecompileTools = "1"
Reexport = "1"
Requires = "1"
Requires = "1.3"
TranscodingStreams = "0.9, 0.10, 0.11"
UUIDs = "1"
julia = "1.6"
21 changes: 13 additions & 8 deletions src/JLD2.jl
@@ -1,11 +1,10 @@
module JLD2
using OrderedCollections: OrderedDict
using Reexport: @reexport
using MacroTools: MacroTools, @capture
using Mmap: Mmap
using Unicode: Unicode
using TranscodingStreams: TranscodingStreams
@reexport using FileIO: load, save
using FileIO: load, save
export load, save
using Requires: @require
using PrecompileTools: @setup_workload, @compile_workload

@@ -106,9 +105,12 @@ mutable struct JLDFile{T<:IO}
path::String
writable::Bool
written::Bool
plain::Bool
compress#::Union{Bool,Symbol}
mmaparrays::Bool
n_times_opened::Int
# Experimental feature: disable committing structs
disable_commit::Bool
datatype_locations::OrderedDict{RelOffset,CommittedDatatype}
datatypes::Vector{H5Datatype}
datatype_wsession::JLDWriteSession{Dict{UInt,RelOffset}}
@@ -124,11 +126,13 @@ mutable struct JLDFile{T<:IO}
root_group::Group{JLDFile{T}}
types_group::Group{JLDFile{T}}
base_address::UInt64


function JLDFile{T}(io::IO, path::AbstractString, writable::Bool, written::Bool,
plain::Bool,
compress,#::Union{Bool,Symbol},
mmaparrays::Bool) where T
f = new(io, path, writable, written, compress, mmaparrays, 1,
f = new(io, path, writable, written, plain, compress, mmaparrays, 1, false,
OrderedDict{RelOffset,CommittedDatatype}(), H5Datatype[],
JLDWriteSession(), Dict{String,Any}(), IdDict(), IdDict(), Dict{RelOffset,WeakRef}(),
DATA_START, Dict{RelOffset,GlobalHeap}(),
@@ -137,8 +141,8 @@ mutable struct JLDFile{T<:IO}
f
end
end
JLDFile(io::IO, path::AbstractString, writable::Bool, written::Bool, compress, mmaparrays::Bool) =
JLDFile{typeof(io)}(io, path, writable, written, compress, mmaparrays)
JLDFile(io::IO, path::AbstractString, writable::Bool, written::Bool, plain::Bool, compress, mmaparrays::Bool) =
JLDFile{typeof(io)}(io, path, writable, written, plain, compress, mmaparrays)

"""
fileoffset(f::JLDFile, x::RelOffset)
@@ -188,6 +192,7 @@ function jldopen(fname::AbstractString, wr::Bool, create::Bool, truncate::Bool,
mmaparrays::Bool=false,
typemap::Dict{String}=Dict{String,Any}(),
parallel_read::Bool=false,
plain::Bool=false
) where T<:Union{Type{IOStream},Type{MmapIO}}
mmaparrays && @warn "mmaparrays keyword is currently ignored" maxlog=1
verify_compressor(compress)
@@ -239,7 +244,7 @@ function jldopen(fname::AbstractString, wr::Bool, create::Bool, truncate::Bool,
io = openfile(iotype, fname, wr, create, truncate, fallback)
created = !exists || truncate
rname = realpath(fname)
f = JLDFile(io, rname, wr, created, compress, mmaparrays)
f = JLDFile(io, rname, wr, created, plain, compress, mmaparrays)

if !parallel_read
OPEN_FILES[rname] = WeakRef(f)
@@ -481,8 +486,8 @@ printtoc(io::IO, f::JLDFile; numlines = typemax(Int64)) =



include("headermessages.jl")
include("object_headers.jl")
include("headermessages.jl")
include("groups.jl")
include("dataspaces.jl")
include("attributes.jl")
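The `plain` keyword threaded through `jldopen` and the `JLDFile` constructor above pairs with the changelog entry about reconstructing committed types as `NamedTuple`s. A minimal sketch of the intended use, assuming default writing behaviour; the struct and field names are purely illustrative:

```julia
using JLD2

struct MyParams
    a::Int
    b::Float64
end

jldsave("params.jld2"; p = MyParams(1, 2.0))

# Experimental: skip reconstructing the committed Julia type and return a
# plain representation instead (expected to be NamedTuple-like).
jldopen("params.jld2", "r"; plain=true) do f
    @show f["p"]   # e.g. (a = 1, b = 2.0)
end
```

This is mainly useful when the original struct definition is not (or cannot be) loaded in the reading session.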
1 change: 0 additions & 1 deletion src/committed_datatype_introspection.jl
@@ -56,7 +56,6 @@ function stringify_object(f, offset)
dataspace = ReadDataspace()
attrs = EMPTY_READ_ATTRIBUTES
datatype::H5Datatype = PlaceholderH5Datatype()
chunked_storage::Bool = false
layout::DataLayout = DataLayout(0,LcCompact,0,-1)
filter_pipeline::FilterPipeline = FilterPipeline(Filter[])
for msg in HeaderMessageIterator(f, offset)
47 changes: 18 additions & 29 deletions src/compression.jl
@@ -106,6 +106,15 @@ function get_compressor(::Bool)
false, COMPRESSOR_TO_ID[:ZlibCompressor], m.ZlibCompressor()
end

function get_compressor(filter_id::UInt16)
modname, compressorname, decompressorname, = ID_TO_DECOMPRESSOR[filter_id]
invoke_again, m = checked_import(modname)
if invoke_again || !applicable(getproperty(m,compressorname))
_, compressor = Base.invokelatest(get_compressor, filter_id)
return true, compressor
end
return invoke_again, getproperty(m,compressorname)()
end
function get_decompressor(filter_id::UInt16)
modname, compressorname, decompressorname, = ID_TO_DECOMPRESSOR[filter_id]
invoke_again, m = checked_import(modname)
@@ -180,35 +189,15 @@ function write_chunked_storage_message( io::IO,
elsize::Int,
dims::NTuple{N,Int},
filtered_size::Int,
offset::RelOffset) where N
jlwrite(io, HeaderMessage(HmDataLayout, chunked_storage_message_size(N) - jlsizeof(HeaderMessage), 0))
jlwrite(io, UInt8(4)) # Version
jlwrite(io, UInt8(LcChunked)) # Layout Class
jlwrite(io, UInt8(2)) # Flags (= SINGLE_INDEX_WITH_FILTER)
jlwrite(io, UInt8(N+1)) # Dimensionality
jlwrite(io, UInt8(jlsizeof(Length))) # Dimensionality Size
for i = N:-1:1
jlwrite(io, Length(dims[i])) # Dimensions 1...N
end
jlwrite(io, Length(elsize)) # Element size (last dimension)
jlwrite(io, UInt8(1)) # Chunk Indexing Type (= Single Chunk)
jlwrite(io, Length(filtered_size)) # Size of filtered chunk
jlwrite(io, UInt32(0)) # Filters for chunk
jlwrite(io, offset) # Address
end


function write_compressed_data(cio, f, data, odr, wsession, filter_id, compressor)
write_filter_pipeline_message(cio, filter_id)

# deflate first
deflated = deflate_data(f, data, odr, wsession, compressor)

write_chunked_storage_message(cio, odr_sizeof(odr), size(data), length(deflated), h5offset(f, f.end_of_data))
jlwrite(f.io, end_checksum(cio))

f.end_of_data += length(deflated)
jlwrite(f.io, deflated)
data_address::RelOffset) where N
write_header_message(io, Val(HmDataLayout);
layout_class = LcChunked,
flags = 2, # (= SINGLE_INDEX_WITH_FILTER)
dimensions = UInt64.((reverse(dims)..., elsize)), # Reversed dimensions with element size as last dim
chunk_indexing_type = 1, # (= Single Chunk)
data_size = filtered_size,
filters = 0, # Filters for chunk
data_address)
end

function decompress!(inptr::Ptr, data_length, element_size, n, decompressor::TranscodingStreams.Codec)
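The new `get_compressor(filter_id::UInt16)` method above is internal plumbing that maps an HDF5 filter id back to a codec instance; users still select compression through the `compress` argument. As a reminder of the public entry points (standard JLD2 usage, not introduced by this PR — the `CodecZlib` pairing is just the common case):

```julia
using JLD2, CodecZlib

data = rand(1_000, 1_000)

jldsave("compressed.jld2", true; data)                     # default compressor
jldsave("compressed_zlib.jld2", ZlibCompressor(); data)    # explicit codec

jldopen("compressed_kw.jld2", "w"; compress=ZlibCompressor()) do f
    f["data"] = data
end
```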
8 changes: 8 additions & 0 deletions src/data/reconstructing_datatypes.jl
@@ -98,6 +98,11 @@ function jltype(f::JLDFile, cdt::CommittedDatatype)
end

datatype = read_attr_data(f, julia_type_attr)
if f.plain && !(datatype isa Upgrade) && !(datatype <: Tuple)
rr = jltype(f, dt)
return f.h5jltype[cdt] = rr
end

if written_type_attr !== nothing
# Custom serialization
custom_datatype = read_attr_data(f, written_type_attr)
@@ -415,6 +420,9 @@ function jlconvert(rr::ReadRepresentation{T,DataTypeODR()},
isunknowntype(m) && return m
unknown_params && return UnknownType{m, Tuple{params...}}
if hasparams
if f.plain && !(m === Tuple)
return Any
end
try
m = m{params...}
catch e
9 changes: 9 additions & 0 deletions src/data/writing_datatypes.jl
@@ -144,6 +144,9 @@ h5type(f::JLDFile, @nospecialize(x)) = h5type(f, writeas(typeof(x)), x)
# Make a compound datatype from a set of names and types
@nospecializeinfer function commit_compound(f::JLDFile, names::AbstractVector{Symbol},
@nospecialize(writtenas::DataType), @nospecialize(readas::Type))
if f.disable_commit
throw(ArgumentError("Attempted to commit DataType $writtenas but committing is disabled."))
end
types = writtenas.types
offsets = Int[]
h5names = Symbol[]
@@ -192,6 +195,9 @@ end
@nospecialize(writeas::DataType),
@nospecialize(readas::DataType),
attributes::WrittenAttribute...)
if f.disable_commit
throw(ArgumentError("Attempted to commit DataType $readas but committing is disabled."))
end
io = f.io

# This needs to be written this way or type inference gets unhappy...
@@ -362,6 +368,9 @@ function h5fieldtype(f::JLDFile, ::Type{T}, readas::Type, ::Initialized) where T
end

@lookup_committed f DataType
if f.disable_commit
throw(ArgumentError("Attempted to commit DataType $readas but committing is disabled."))
end
io = f.io
offset = f.end_of_data

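All three guards added above key off the new `disable_commit` field. How that flag is meant to be switched on by users is not visible in this excerpt, so the sketch below flips the field directly on the file object purely for illustration; only the resulting `ArgumentError` is taken from the code above, everything else is an assumption.

```julia
using JLD2

struct Foo
    x::Int
end

f = jldopen("nocommit.jld2", "w")
f.disable_commit = true    # assumed to be settable like this; see caveat above

f["n"] = 42                # HDF5-native numbers, strings, and arrays still work
f["s"] = "hello"

try
    f["foo"] = Foo(1)      # would need a committed datatype, so this should throw
catch err
    @show err              # ArgumentError("Attempted to commit DataType Foo ...")
end

close(f)
```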
8 changes: 8 additions & 0 deletions src/dataio.jl
@@ -227,6 +227,14 @@ function write_data(io::IOStream, f::JLDFile, data::Array{T}, odr::Type{T}, ::Re
nothing
end

function write_data(io::IOStream, f::JLDFile, data, odr, _, wsession::JLDWriteSession)
buf = Vector{UInt8}(undef, odr_sizeof(odr))
cp = Ptr{Cvoid}(pointer(buf))
h5convert!(cp, odr, f, data, wsession)
unsafe_write(io, Ptr{UInt8}(pointer(buf)), odr_sizeof(odr))
nothing
end

function write_data(io::BufferedWriter, f::JLDFile, data::Array{T}, odr::S,
::DataMode, wsession::JLDWriteSession) where {T,S}
position = io.position[]
84 changes: 15 additions & 69 deletions src/datalayouts.jl
@@ -1,31 +1,3 @@
struct CompactStorageMessage
hm::HeaderMessage
version::UInt8
layout_class::LayoutClass
data_size::UInt16
end
define_packed(CompactStorageMessage)
CompactStorageMessage(datasz::Int) =
CompactStorageMessage(
HeaderMessage(HmDataLayout, jlsizeof(CompactStorageMessage) - jlsizeof(HeaderMessage) + datasz, 0),
4, LcCompact, datasz
)

struct ContiguousStorageMessage
hm::HeaderMessage
version::UInt8
layout_class::LayoutClass
address::RelOffset
data_size::Length
end
define_packed(ContiguousStorageMessage)
ContiguousStorageMessage(datasz::Int, offset::RelOffset) =
ContiguousStorageMessage(
HeaderMessage(HmDataLayout, jlsizeof(ContiguousStorageMessage) - jlsizeof(HeaderMessage), 0),
4, LcContiguous, offset, datasz
)


## Left over header message parsing that does not have a good place.

struct DataLayout
@@ -87,47 +59,21 @@ function FilterPipeline(msg_::Hmessage)
nfilters = msg.nfilters
io = msg.m.io
seek(io, msg.m.address+2)
if version == 1
skip(io, 6)
filters = map(1:nfilters) do _
id = jlread(io, UInt16)
name_length = jlread(io, UInt16)
flags = jlread(io, UInt16)
nclient_vals = jlread(io, UInt16)
if iszero(name_length)
name = ""
else
name = read_bytestring(io)
skip(io, 8-mod1(sizeof(name), 8)-1)
end
client_data = jlread(io, UInt32, nclient_vals)
isodd(nclient_vals) && skip(io, 4)
Filter(id, flags, name, client_data)
end
return FilterPipeline(filters)
elseif version == 2
filters = map(1:nfilters) do _
id = jlread(io, UInt16)
if id > 255
name_length = jlread(io, UInt16)
flags = jlread(io, UInt16)
nclient_vals = jlread(io, UInt16)
if iszero(name_length)
name = ""
else
name = read_bytestring(io)
skip(io, 8-mod1(sizeof(name), 8)-1)
end
else
name = ""
flags = jlread(io, UInt16)
nclient_vals = jlread(io, UInt16)
end
client_data = jlread(io, UInt32, nclient_vals)
Filter(id, flags, name, client_data)
version == 1 && skip(io, 6)
filters = map(1:nfilters) do _
id = jlread(io, UInt16)
name_length = (version == 2 && id < 255) ? zero(UInt16) : jlread(io, UInt16)
flags = jlread(io, UInt16)
nclient_vals = jlread(io, UInt16)
if iszero(name_length)
name = ""
else
name = read_bytestring(io)
skip(io, 8-mod1(sizeof(name), 8)-1)
end
return FilterPipeline(filters)
else
throw(UnsupportedVersionException("Filter Pipeline Message version $version is not implemented"))
client_data = jlread(io, UInt32, nclient_vals)
(version == 1 && isodd(nclient_vals)) && skip(io, 4)
Filter(id, flags, name, client_data)
end
return FilterPipeline(filters)
end

2 comments on commit 99782ad

@JonasIsensee
Collaborator Author

@JuliaRegistrator register()

@JuliaRegistrator

Registration pull request created: JuliaRegistries/General/113843

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.4.52 -m "<description of version>" 99782ad05f1f5fd106d3ff7fc8d7f10852a608a8
git push origin v0.4.52
