From aa94060a87be3fad8ba29ebbecec1cdffad2776d Mon Sep 17 00:00:00 2001 From: Jonas Isensee Date: Tue, 30 Jul 2024 19:14:05 +0200 Subject: [PATCH 01/11] fix attribute loading --- src/explicit_datasets.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/explicit_datasets.jl b/src/explicit_datasets.jl index bd0170b4..d3374962 100644 --- a/src/explicit_datasets.jl +++ b/src/explicit_datasets.jl @@ -408,7 +408,7 @@ end function attributes(dset::Dataset; plain::Bool=false) plain && return dset.attributes - map(values(dset.attributes)) do attr + OrderedDict(keys(dset.attributes) .=> map(values(dset.attributes)) do attr read_attr_data(dset.parent.f, attr) - end + end) end \ No newline at end of file From 7f792e894e4f7fff39f6600d0ad47ca066537780 Mon Sep 17 00:00:00 2001 From: JonasIsensee Date: Tue, 20 Aug 2024 08:40:13 +0200 Subject: [PATCH 02/11] Update ci.yml --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 559e9b20..d31f815b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,9 +3,11 @@ on: pull_request: branches: - master + - dev push: branches: - master + - dev tags: '*' jobs: test: From cc86e5d256bf3406fd55a7a03f67b37e80a28bca Mon Sep 17 00:00:00 2001 From: JonasIsensee Date: Fri, 23 Aug 2024 08:52:26 +0200 Subject: [PATCH 03/11] WIP: Mmappable Arrays (#582) * wip: mmappable arrays * tests * include mmap_test * disable broken mmap on windows * update warning test --- Project.toml | 3 - src/JLD2.jl | 5 +- src/dataio.jl | 8 ++ src/datasets.jl | 11 +-- src/explicit_datasets.jl | 155 ++++++++++++++++++++++++++++++++++++--- test/mmap_test.jl | 88 ++++++++++++++++++++++ test/runtests.jl | 3 +- 7 files changed, 250 insertions(+), 23 deletions(-) create mode 100644 test/mmap_test.jl diff --git a/Project.toml b/Project.toml index 53f69d91..f18418c0 100644 --- a/Project.toml +++ b/Project.toml @@ -8,11 +8,9 @@ MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" Mmap = "a63ad114-7e13-5084-954f-fe012c677804" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" -Reexport = "189a3867-3050-52da-a836-e630ba90ab69" Requires = "ae029012-a4dd-5104-9daa-d747884805df" TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" -Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [compat] FileIO = "1" @@ -20,7 +18,6 @@ MacroTools = "0.5" Mmap = "1" OrderedCollections = "1" PrecompileTools = "1" -Reexport = "1" Requires = "1" TranscodingStreams = "0.9, 0.10, 0.11" UUIDs = "1" diff --git a/src/JLD2.jl b/src/JLD2.jl index 54e5eeaa..b03c71e9 100644 --- a/src/JLD2.jl +++ b/src/JLD2.jl @@ -1,11 +1,10 @@ module JLD2 using OrderedCollections: OrderedDict -using Reexport: @reexport using MacroTools: MacroTools, @capture using Mmap: Mmap -using Unicode: Unicode using TranscodingStreams: TranscodingStreams -@reexport using FileIO: load, save +using FileIO: load, save +export load, save using Requires: @require using PrecompileTools: @setup_workload, @compile_workload diff --git a/src/dataio.jl b/src/dataio.jl index 3941bedd..66b0693d 100644 --- a/src/dataio.jl +++ b/src/dataio.jl @@ -227,6 +227,14 @@ function write_data(io::IOStream, f::JLDFile, data::Array{T}, odr::Type{T}, ::Re nothing end +function write_data(io::IOStream, f::JLDFile, data, odr, _, wsession::JLDWriteSession) + buf = Vector{UInt8}(undef, odr_sizeof(odr)) + cp = Ptr{Cvoid}(pointer(buf)) + h5convert!(cp, odr, f, 
data, wsession) + unsafe_write(io, Ptr{UInt8}(pointer(buf)), odr_sizeof(odr)) + nothing +end + function write_data(io::BufferedWriter, f::JLDFile, data::Array{T}, odr::S, ::DataMode, wsession::JLDWriteSession) where {T,S} position = io.position[] diff --git a/src/datasets.jl b/src/datasets.jl index e27cd5ee..2cdcab85 100644 --- a/src/datasets.jl +++ b/src/datasets.jl @@ -226,7 +226,7 @@ end get_ndims_offset(f::JLDFile, dataspace::ReadDataspace, attributes::Nothing) = (dataspace.dimensionality, dataspace.dimensions_offset) -function get_ndims_offset(f::JLDFile, dataspace::ReadDataspace, attributes::Vector{ReadAttribute}) +function get_ndims_offset(f::JLDFile, dataspace::ReadDataspace, attributes::AbstractVector) ndims = dataspace.dimensionality offset = dataspace.dimensions_offset if !isempty(attributes) @@ -363,8 +363,7 @@ end psz += CONTINUATION_MSG_SIZE # Figure out the layout - # The simplest CompactStorageMessage only supports data sets < 2^16 - if datasz < 8192 || (!(data isa Array) && datasz < typemax(UInt16)) + if datasz == 0 || (!(data isa Array) && datasz < 8192) layout_class = LcCompact psz += jlsizeof(CompactStorageMessage) + datasz elseif data isa Array && compress != false && isconcretetype(eltype(data)) && isbitstype(eltype(data)) @@ -420,11 +419,13 @@ end f.end_of_data += length(deflated) jlwrite(f.io, deflated) else - jlwrite(cio, ContiguousStorageMessage(datasz, h5offset(f, f.end_of_data))) + data_address = f.end_of_data + 8 - mod1(f.end_of_data, 8) + jlwrite(cio, ContiguousStorageMessage(datasz, h5offset(f, data_address))) jlwrite(cio, CONTINUATION_PLACEHOLDER) jlwrite(io, end_checksum(cio)) - f.end_of_data += datasz + f.end_of_data = data_address + datasz + seek(io, data_address) write_data(io, f, data, odr, datamode(odr), wsession) end diff --git a/src/explicit_datasets.jl b/src/explicit_datasets.jl index d3374962..06f55f6c 100644 --- a/src/explicit_datasets.jl +++ b/src/explicit_datasets.jl @@ -12,18 +12,35 @@ mutable struct Dataset header_chunk_info # chunk_start, chunk_end, next_msg_offset end + +""" + create_dataset(parent, path, datatype, dataspace; kwargs...) + +Arguments: + - `parent::Union{JLDfile, Group}`: Containing group of new dataset + - `path`: Path to new dataset relative to `parent`. If `path` is `nothing`, the dataset is unnamed. + - `datatype`: Datatype of new dataset (element type in case of arrays) + - `dataspace`: Dimensions or `Dataspace` of new dataset + +Keyword arguments: + - `layout`: `DataLayout` of new dataset + - `filters`: `FilterPipeline` for describing the compression pipeline +""" create_dataset(f::JLDFile, args...; kwargs...) = create_dataset(f.root_group, args...; kwargs...) 
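(Usage sketch, not part of the patch: it mirrors the test added later in this series in test/dataset_api.jl; the dataset name "d", the file name, and the zeros array are illustrative only, and it assumes the defaults described in the docstring above, i.e. datatype and dataspace are derived from the data at write time.)

    jldopen("example.jld2", "w") do f
        dset = JLD2.create_dataset(f, "d")          # unwritten Dataset handle
        JLD2.write_dataset(dset, zeros(100, 100))   # datatype/dataspace inferred from the array
    end
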
function create_dataset( - parent::Group, - name::Union{Nothing,String}, + g::Group, + path::Union{Nothing,String}, datatype=nothing, dataspace=nothing; layout = nothing, chunk=nothing, - filters=Filter[], + filters=FilterPipeline(), ) - if !isnothing(name) - (parent, name) = pathize(parent, name, true) + if !isnothing(path) + (parent, name) = pathize(g, path, true) + else + name = "" + parent = g.f end return Dataset(parent, name, UNDEFINED_ADDRESS, datatype, dataspace, @@ -119,6 +136,7 @@ function write_dataset(dataset::Dataset, data) throw(ArgumentError("Invalid attribute: $a")) end io = f.io + odr = objodr(data) datasz = odr_sizeof(odr)::Int * numel(dataspace)::Int psz = payload_size_without_storage_message(dataspace, datatype)::Int @@ -131,11 +149,11 @@ function write_dataset(dataset::Dataset, data) # determine layout class # DataLayout object is only available after the data is written - if datasz < 8192 + if datasz == 0 || (!(data isa Array) && datasz < 8192) layout_class = LcCompact psz += jlsizeof(CompactStorageMessage) + datasz - elseif !isnothing(dataset.chunk) || !isempty(dataset.filters) + elseif !isnothing(dataset.chunk) || !isempty(dataset.filters.filters) # Do some additional checks on the data here layout_class = LcChunked # improve filter support here @@ -144,7 +162,7 @@ function write_dataset(dataset::Dataset, data) layout_class = LcContiguous psz += jlsizeof(ContiguousStorageMessage) end - fullsz = jlsizeof(ObjectStart) + size_size(psz) + psz + 4 # why do I need to correct here? + fullsz = jlsizeof(ObjectStart) + size_size(psz) + psz + 4 header_offset = f.end_of_data seek(io, header_offset) @@ -191,14 +209,18 @@ function write_dataset(dataset::Dataset, data) jlwrite(f.io, end_checksum(cio)) else - jlwrite(cio, ContiguousStorageMessage(datasz, h5offset(f, f.end_of_data))) + # Align contiguous chunk to 8 bytes in the file + address = f.end_of_data + 8 - mod1(f.end_of_data, 8) + offset = h5offset(f, address) + jlwrite(cio, ContiguousStorageMessage(datasz, offset)) dataset.header_chunk_info = (header_offset, position(cio)+20, position(cio)) # Add NIL message replacable by continuation message jlwrite(io, CONTINUATION_PLACEHOLDER) jlwrite(io, end_checksum(cio)) - f.end_of_data += datasz + f.end_of_data = address + datasz + seek(io, address) write_data(io, f, data, odr, datamode(odr), wsession) end @@ -243,7 +265,7 @@ function get_dataset(f::JLDFile, offset::RelOffset, g=f.root_group, name="") hmitr = HeaderMessageIterator(f, offset) for msg in hmitr if msg.type == HmDataspace - dset.dataspace = HmWrap(HmDataspace, msg)#ReadDataspace(f, msg) + dset.dataspace = HmWrap(HmDataspace, msg) elseif msg.type == HmDatatype dset.datatype = HmWrap(HmDatatype, msg).dt elseif msg.type == HmDataLayout @@ -411,4 +433,115 @@ function attributes(dset::Dataset; plain::Bool=false) OrderedDict(keys(dset.attributes) .=> map(values(dset.attributes)) do attr read_attr_data(dset.parent.f, attr) end) +end + +## Mmap Arrays +function ismmappable(dset::Dataset) + iswritten(dset) || return false + f = dset.parent.f + dt = dset.datatype + if dt isa SharedDatatype + rr = jltype(f, get(f.datatype_locations, dt.header_offset, dt)) + else + rr = jltype(f, dt) + end + T = typeof(rr).parameters[1] + !(samelayout(T)) && return false + !isempty(dset.filters.filters) && return false + ret = false + if (layout = dset.layout) isa HmWrap{HmDataLayout} + ret = (layout.layout_class == LcContiguous && layout.data_address != UNDEFINED_ADDRESS) + end + if ret == true && Sys.iswindows() && dset.parent.f.writable + @warn 
"On Windows memory-mapping is only possible for files in read-only mode." + ret = false + end + return ret +end + +function readmmap(dset::Dataset) + ismmappable(dset) || throw(ArgumentError("Dataset is not mmappable")) + f = dset.parent.f + + # figure out the element type + dt = dset.datatype + if dt isa SharedDatatype + rr = jltype(f, get(f.datatype_locations, dt.header_offset, dt)) + else + rr = jltype(f, dt) + end + T = typeof(rr).parameters[1] + ndims, offset = get_ndims_offset(f, ReadDataspace(f, dset.dataspace), collect(values(dset.attributes))) + + io = f.io + seek(io, offset) + dims = [jlread(io, Int64) for i in 1:ndims] + iobackend = io isa IOStream ? io : io.f + seek(iobackend, DataLayout(f, dset.layout).data_offset) + return Mmap.mmap(iobackend, Array{T, Int(ndims)}, (reverse(dims)..., )) +end + +@static if !Sys.iswindows() +function allocate_early(dset::Dataset, T::DataType) + iswritten(dset) && throw(ArgumentError("Dataset has already been written to file")) + # for this to work, require all information to be provided + isnothing(dset.datatype) && throw(ArgumentError("datatype must be provided")) + isnothing(dset.dataspace) && throw(ArgumentError("dataspace must be provided")) + datatype = dset.datatype + dataspace = dset.dataspace + + f = dset.parent.f + attributes = map(collect(dset.attributes)) do (name, attr) + attr isa WrittenAttribute && return attr + return WrittenAttribute(f, name, attr) + throw(ArgumentError("Invalid attribute: $a")) + end + writtenas = writeas(T) + odr_ = _odr(writtenas, T, odr(writtenas)) + datasz = odr_sizeof(odr_)::Int * numel(dataspace)::Int + psz = payload_size_without_storage_message(dataspace, datatype)::Int + psz += sum(message_size.(attributes), init=0) + # minimum extra space for continuation message + psz += jlsizeof(HeaderMessage) + jlsizeof(RelOffset) + jlsizeof(Length) + + # Layout class: Use contiguous for now + layout_class = LcContiguous + psz += jlsizeof(ContiguousStorageMessage) + fullsz = jlsizeof(ObjectStart) + size_size(psz) + psz + 4 + + header_offset = f.end_of_data + io = f.io + seek(io, header_offset) + f.end_of_data = header_offset + fullsz + + cio = begin_checksum_write(io, fullsz - 4) + write_object_header_and_dataspace_message(cio, f, psz, dataspace) + write_datatype_message(cio, datatype) + for a in attributes + write_message(cio, f, a, wsession) + end + # Align contiguous chunk to 8 bytes in the file + address = f.end_of_data + 8 - mod1(f.end_of_data, 8) + offset = h5offset(f, address) + jlwrite(cio, ContiguousStorageMessage(datasz, offset)) + + dset.header_chunk_info = (header_offset, position(cio)+20, position(cio)) + # Add NIL message replacable by continuation message + jlwrite(io, CONTINUATION_PLACEHOLDER) + jlwrite(io, end_checksum(cio)) + + f.end_of_data = address + datasz + seek(io, f.end_of_data) + + offset = h5offset(f, header_offset) + !isempty(dset.name) && (dset.parent[dset.name] = offset) + #dset.offset = offset + + # load current dataset as new dataset + ddset = get_dataset(f, offset, dset.parent, dset.name) + for field in fieldnames(Dataset) + setproperty!(dset, field, getfield(ddset, field)) + end + return offset +end end \ No newline at end of file diff --git a/test/mmap_test.jl b/test/mmap_test.jl new file mode 100644 index 00000000..ac8f85b4 --- /dev/null +++ b/test/mmap_test.jl @@ -0,0 +1,88 @@ +using JLD2, Test + +@testset "Mmapped Arrays" begin + cd(mktempdir()) do + + a = rand(100,100); + b = rand(ComplexF64, 5,5) + c = 42 + d = [ntuple(x->Bool(x%2), Val(24)) for i=1:100] + + fn = "test.jld2" + 
jldsave(fn; a, b, c, d) + + jldopen(fn, "r") do f + dset = JLD2.get_dataset(f, "a") + @test JLD2.ismmappable(dset) + @test JLD2.readmmap(dset) == a + dset = JLD2.get_dataset(f, "b") + @test JLD2.ismmappable(dset) + @test JLD2.readmmap(dset) == b + dset = JLD2.get_dataset(f, "c") + @test JLD2.ismmappable(dset) == false + dset = JLD2.get_dataset(f, "d") + @test JLD2.ismmappable(dset) == true + end + + if Sys.iswindows() + jldopen(fn, "a") do f + dset = JLD2.get_dataset(f, "a") + @test JLD2.ismmappable(dset) == false + @test_logs (:warn, "On Windows memory-mapping is only possible for files in read-only mode.") JLD2.ismmappable(dset) + dset = JLD2.get_dataset(f, "c") + @test JLD2.ismmappable(dset) == false + @test_nowarn JLD2.ismmappable(dset) + end + else + jldopen(fn, "a") do f + dset = JLD2.get_dataset(f, "a") + @test JLD2.ismmappable(dset) + @test JLD2.readmmap(dset) == a + JLD2.readmmap(dset)[1,1] = 42.0 + + dset = JLD2.get_dataset(f, "b") + @test JLD2.ismmappable(dset) + @test JLD2.readmmap(dset) == b + JLD2.readmmap(dset)[1,1] = 4.0 + 2.0im + + dset = JLD2.get_dataset(f, "c") + @test JLD2.ismmappable(dset) == false + + dset = JLD2.get_dataset(f, "d") + @test JLD2.ismmappable(dset) == true + end + + jldopen(fn, "r") do f + @test f["a"][1,1] == 42.0 + @test f["b"][1,1] == 4.0 + 2.0im + @test f["d"] == d + end + end + end +end + +if !Sys.iswindows() + @testset "Early Allocation" begin + # Update this for proper API eventually + jldopen(fn, "w") do f + dset = JLD2.create_dataset(f, "data") + + dset.datatype = JLD2.h5fieldtype(f, Float64, Float64, Val{false}) + + dims = (100,100) + dset.dataspace = JLD2.WriteDataspace(JLD2.DS_SIMPLE, UInt64.(reverse(dims)), ()) + + JLD2.allocate_early(dset, Float64) + + @test JLD2.ismmappable(dset) + + emptyarr = JLD2.readmmap(dset) + + emptyarr[1:2:100] .= 1:50 + end + + data = JLD2.load(fn, "data") + @test all(data[2:2:100] .== 0.0) + @test all(data[1:2:100] .== 1:50) + end +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 85d938bd..a6534548 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -31,4 +31,5 @@ include("customserialization.jl") include("compression.jl") include("test_files.jl") include("unpack_test.jl") -include("dataset_api.jl") \ No newline at end of file +include("dataset_api.jl") +include("mmap_test.jl") \ No newline at end of file From dd7c9b0bb5c0446954e2f38a349584d99d48ad99 Mon Sep 17 00:00:00 2001 From: JonasIsensee Date: Sat, 24 Aug 2024 14:11:10 +0200 Subject: [PATCH 04/11] downgrade testing (#547) * downgrade testing * bump compat for Requires * bump compat for FileIO * rm UUIDs * skip Mmap * remove test that only tests a FileIO feature --------- Co-authored-by: Jonas Isensee --- .github/workflows/Downgrade.yml | 28 ++++++++++++++++++++++++++++ Project.toml | 7 ++----- test/loadsave.jl | 7 ------- 3 files changed, 30 insertions(+), 12 deletions(-) create mode 100644 .github/workflows/Downgrade.yml diff --git a/.github/workflows/Downgrade.yml b/.github/workflows/Downgrade.yml new file mode 100644 index 00000000..8e66c1db --- /dev/null +++ b/.github/workflows/Downgrade.yml @@ -0,0 +1,28 @@ +name: Downgrade +on: + pull_request: + branches: + - master + paths-ignore: + - 'docs/**' + push: + branches: + - master + paths-ignore: + - 'docs/**' +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + version: ['1'] + steps: + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v1 + with: + version: ${{ matrix.version }} + - uses: cjdoris/julia-downgrade-compat-action@v1 + with: + 
skip: Pkg,TOML,Mmap + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 diff --git a/Project.toml b/Project.toml index f18418c0..3418b006 100644 --- a/Project.toml +++ b/Project.toml @@ -10,15 +10,12 @@ OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" Requires = "ae029012-a4dd-5104-9daa-d747884805df" TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" -UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [compat] -FileIO = "1" +FileIO = "1.5" MacroTools = "0.5" -Mmap = "1" OrderedCollections = "1" PrecompileTools = "1" -Requires = "1" +Requires = "1.3" TranscodingStreams = "0.9, 0.10, 0.11" -UUIDs = "1" julia = "1.6" diff --git a/test/loadsave.jl b/test/loadsave.jl index 311eb753..e0d3a4f8 100644 --- a/test/loadsave.jl +++ b/test/loadsave.jl @@ -117,13 +117,6 @@ jldopen(fn, "r+") do f @test f["x2"] == x2 end -# Issue #19 -save(fn, Dict("a"=>[1,2,3])) -io = open(fn) -@info("The next error message (involving \"loading nothing\") is a sign of normal operation") -@test_throws FileIO.CapturedException load(io) -close(io) - # Issue #33 d = Dict("params/p1" => 1, "params/p2" => 2., From 140dc81b3c04a4b6c3c5e70f7014a6fe49b025b1 Mon Sep 17 00:00:00 2001 From: JonasIsensee Date: Sun, 25 Aug 2024 09:43:32 +0200 Subject: [PATCH 05/11] experimental disable commit (#544) * experimental disable commit * 1.6 compat and make disable_commit a field of the file --- src/JLD2.jl | 4 +++- src/data/writing_datatypes.jl | 9 +++++++++ test/loadsave.jl | 13 +++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/JLD2.jl b/src/JLD2.jl index b03c71e9..cefc4a9e 100644 --- a/src/JLD2.jl +++ b/src/JLD2.jl @@ -108,6 +108,8 @@ mutable struct JLDFile{T<:IO} compress#::Union{Bool,Symbol} mmaparrays::Bool n_times_opened::Int + # Experimental feature: disable committing structs + disable_commit::Bool datatype_locations::OrderedDict{RelOffset,CommittedDatatype} datatypes::Vector{H5Datatype} datatype_wsession::JLDWriteSession{Dict{UInt,RelOffset}} @@ -127,7 +129,7 @@ mutable struct JLDFile{T<:IO} function JLDFile{T}(io::IO, path::AbstractString, writable::Bool, written::Bool, compress,#::Union{Bool,Symbol}, mmaparrays::Bool) where T - f = new(io, path, writable, written, compress, mmaparrays, 1, + f = new(io, path, writable, written, compress, mmaparrays, 1, false, OrderedDict{RelOffset,CommittedDatatype}(), H5Datatype[], JLDWriteSession(), Dict{String,Any}(), IdDict(), IdDict(), Dict{RelOffset,WeakRef}(), DATA_START, Dict{RelOffset,GlobalHeap}(), diff --git a/src/data/writing_datatypes.jl b/src/data/writing_datatypes.jl index a080638b..bf8ba472 100644 --- a/src/data/writing_datatypes.jl +++ b/src/data/writing_datatypes.jl @@ -144,6 +144,9 @@ h5type(f::JLDFile, @nospecialize(x)) = h5type(f, writeas(typeof(x)), x) # Make a compound datatype from a set of names and types @nospecializeinfer function commit_compound(f::JLDFile, names::AbstractVector{Symbol}, @nospecialize(writtenas::DataType), @nospecialize(readas::Type)) + if f.disable_commit + throw(ArgumentError("Attempted to commit DataType $writtenas but committing is disabled.")) + end types = writtenas.types offsets = Int[] h5names = Symbol[] @@ -192,6 +195,9 @@ end @nospecialize(writeas::DataType), @nospecialize(readas::DataType), attributes::WrittenAttribute...) 
+ if f.disable_commit + throw(ArgumentError("Attempted to commit DataType $readas but committing is disabled.")) + end io = f.io # This needs to be written this way or type inference gets unhappy... @@ -362,6 +368,9 @@ function h5fieldtype(f::JLDFile, ::Type{T}, readas::Type, ::Initialized) where T end @lookup_committed f DataType + if f.disable_commit + throw(ArgumentError("Attempted to commit DataType $readas but committing is disabled.")) + end io = f.io offset = f.end_of_data diff --git a/test/loadsave.jl b/test/loadsave.jl index e0d3a4f8..7dcb3afc 100644 --- a/test/loadsave.jl +++ b/test/loadsave.jl @@ -744,3 +744,16 @@ end end end +@testset "Disable committing datatypes" begin + cd(mktempdir()) do + jldopen("test.jld2", "w") do f + f.disable_commit = true + + @test_throws ArgumentError f["1"] = Dict(1=>2) + @test_throws ArgumentError f["2"] = Vector{Float64} + @test_throws ArgumentError f["3"] = (1,2,3) + # this could eventually be allowed + @test_throws ArgumentError f["4"] = (; a=1, b=2) + end + end +end From 07c04bec0e1e8460b69219489d1a837bd25e31a0 Mon Sep 17 00:00:00 2001 From: Jonas Isensee Date: Sun, 25 Aug 2024 13:31:24 +0200 Subject: [PATCH 06/11] some code cleanup --- src/JLD2.jl | 2 +- src/compression.jl | 25 ++++++++------------ src/datasets.jl | 27 ++++++++-------------- src/datatypes.jl | 9 ++++---- src/explicit_datasets.jl | 39 ++++++++++++++++---------------- src/groups.jl | 26 ++++++--------------- src/headermessages.jl | 30 +++++++++++++++++------- src/macros_utils.jl | 28 +++++++++++------------ src/object_headers.jl | 49 ++++++++++------------------------------ src/types.jl | 4 ++++ 10 files changed, 101 insertions(+), 138 deletions(-) diff --git a/src/JLD2.jl b/src/JLD2.jl index cefc4a9e..b0a19a27 100644 --- a/src/JLD2.jl +++ b/src/JLD2.jl @@ -482,8 +482,8 @@ printtoc(io::IO, f::JLDFile; numlines = typemax(Int64)) = -include("headermessages.jl") include("object_headers.jl") +include("headermessages.jl") include("groups.jl") include("dataspaces.jl") include("attributes.jl") diff --git a/src/compression.jl b/src/compression.jl index ad6a6f77..d3f93a1f 100644 --- a/src/compression.jl +++ b/src/compression.jl @@ -180,24 +180,17 @@ function write_chunked_storage_message( io::IO, elsize::Int, dims::NTuple{N,Int}, filtered_size::Int, - offset::RelOffset) where N - jlwrite(io, HeaderMessage(HmDataLayout, chunked_storage_message_size(N) - jlsizeof(HeaderMessage), 0)) - jlwrite(io, UInt8(4)) # Version - jlwrite(io, UInt8(LcChunked)) # Layout Class - jlwrite(io, UInt8(2)) # Flags (= SINGLE_INDEX_WITH_FILTER) - jlwrite(io, UInt8(N+1)) # Dimensionality - jlwrite(io, UInt8(jlsizeof(Length))) # Dimensionality Size - for i = N:-1:1 - jlwrite(io, Length(dims[i])) # Dimensions 1...N - end - jlwrite(io, Length(elsize)) # Element size (last dimension) - jlwrite(io, UInt8(1)) # Chunk Indexing Type (= Single Chunk) - jlwrite(io, Length(filtered_size)) # Size of filtered chunk - jlwrite(io, UInt32(0)) # Filters for chunk - jlwrite(io, offset) # Address + data_address::RelOffset) where N + write_header_message(io, Val(HmDataLayout); + layout_class = LcChunked, + flags = 2, # (= SINGLE_INDEX_WITH_FILTER) + dimensions = UInt64.((reverse(dims)..., elsize)), # Reversed dimensions with element size as last dim + chunk_indexing_type = 1, # (= Single Chunk) + data_size = filtered_size, + filters = 0, # Filters for chunk + data_address) end - function write_compressed_data(cio, f, data, odr, wsession, filter_id, compressor) write_filter_pipeline_message(cio, filter_id) diff --git 
a/src/datasets.jl b/src/datasets.jl index 2cdcab85..31d19df3 100644 --- a/src/datasets.jl +++ b/src/datasets.jl @@ -400,7 +400,7 @@ end if datasz != 0 write_data(cio, f, data, odr, datamode(odr), wsession) end - jlwrite(cio, CONTINUATION_PLACEHOLDER) + write_continuation_placeholder(cio) jlwrite(io, end_checksum(cio)) elseif layout_class == LcChunked @@ -412,7 +412,7 @@ end write_chunked_storage_message(cio, odr_sizeof(odr), size(data), length(deflated), h5offset(f, f.end_of_data)) # Add NIL message replacable by continuation message - jlwrite(cio, CONTINUATION_PLACEHOLDER) + write_continuation_placeholder(cio) jlwrite(f.io, end_checksum(cio)) seek(f.io, f.end_of_data) @@ -421,7 +421,7 @@ end else data_address = f.end_of_data + 8 - mod1(f.end_of_data, 8) jlwrite(cio, ContiguousStorageMessage(datasz, h5offset(f, data_address))) - jlwrite(cio, CONTINUATION_PLACEHOLDER) + write_continuation_placeholder(cio) jlwrite(io, end_checksum(cio)) f.end_of_data = data_address + datasz @@ -436,26 +436,17 @@ function write_object_header_and_dataspace_message(cio::IO, f::JLDFile, psz::Int jlwrite(cio, ObjectStart(size_flag(psz))) write_size(cio, psz) - # Fill value - jlwrite(cio, HeaderMessage(HmFillValue, 2, 0)) - jlwrite(cio, UInt8(3)) # Version - jlwrite(cio, 0x09) # Flags - - # Dataspace - jlwrite(cio, HeaderMessage(HmDataspace, jlsizeof(dataspace), 0)) - jlwrite(cio, dataspace) + write_header_message(cio, Val(HmFillValue); flags=0x09) + write_header_message(cio, Val(HmDataspace); dataspace.dataspace_type, dimensions=dataspace.size) # Attributes for attr in dataspace.attributes - jlwrite(cio, HeaderMessage(HmAttribute, jlsizeof(attr), 0)) - write_attribute(cio, f, attr, f.datatype_wsession) + write_header_message(cio, f, attr) end end -function write_datatype_message(cio::IO, datatype::H5Datatype) - jlwrite(cio, HeaderMessage(HmDatatype, jlsizeof(datatype), 1 | (2*isa(datatype, CommittedDatatype)))) - jlwrite(cio, datatype) -end +write_datatype_message(cio::IO, dt::H5Datatype) = + write_header_message(cio, Val(HmDatatype), 1 | (2*isa(dt, CommittedDatatype)); dt) @nospecializeinfer function write_dataset(f::JLDFile, @nospecialize(x), wsession::JLDWriteSession)::RelOffset @@ -523,7 +514,7 @@ function delete_written_link!(f::JLDFile, roffset::RelOffset, name::AbstractStri if msg.type == HmLinkMessage && HmWrap(HmLinkMessage, msg).link_name == name # delete link seek(f.io, fileoffset(f, msg.offset)) - jlwrite(f.io, HeaderMessage(HmNil, msg.size, 0)) + write_header_message(f.io, Val(HmNil), 0, msg.size) update_checksum(f.io, iter.chunk.chunk_start, iter.chunk.chunk_end) end end diff --git a/src/datatypes.jl b/src/datatypes.jl index 88cf1fd0..202e65d2 100644 --- a/src/datatypes.jl +++ b/src/datatypes.jl @@ -312,7 +312,8 @@ end function commit(f::JLDFile, @nospecialize(dt::H5Datatype), attrs::Tuple{Vararg{WrittenAttribute}}=()) - psz = jlsizeof(HeaderMessage) * (length(attrs) + 1) + jlsizeof(dt) + psz = jlsizeof(Val(HmDatatype), 64; dt) + psz += jlsizeof(HeaderMessage) * (length(attrs)) for attr in attrs psz += jlsizeof(attr) end @@ -326,11 +327,9 @@ function commit(f::JLDFile, cio = begin_checksum_write(io, sz) jlwrite(cio, ObjectStart(size_flag(psz))) write_size(cio, psz) - jlwrite(cio, HeaderMessage(HmDatatype, jlsizeof(dt), 64)) - jlwrite(cio, dt) + write_header_message(cio, Val(HmDatatype), 64; dt) for attr in attrs - jlwrite(cio, HeaderMessage(HmAttribute, jlsizeof(attr), 0)) - write_attribute(cio, f, attr, f.datatype_wsession) + write_header_message(cio, f, attr) end jlwrite(io, end_checksum(cio)) end 
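(Note on the calling convention introduced in this commit: a header message is described twice with the same keyword arguments, once via jlsizeof(Val(HmX), flags; kwargs...) while the object-header payload size is accumulated, and once via write_header_message(io, Val(HmX), flags; kwargs...) when the bytes are emitted. A minimal sketch of that pairing, reusing the HmDatatype call from the commit() hunk above; cio and dt stand for the checksummed stream and the H5Datatype exactly as in that function:)

    psz = jlsizeof(Val(HmDatatype), 64; dt)        # message size incl. the 4-byte type/size/flags header
    # ... write ObjectStart and the accumulated size, then:
    write_header_message(cio, Val(HmDatatype), 64; dt)
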
diff --git a/src/explicit_datasets.jl b/src/explicit_datasets.jl index 06f55f6c..2f5d39a7 100644 --- a/src/explicit_datasets.jl +++ b/src/explicit_datasets.jl @@ -177,7 +177,7 @@ function write_dataset(dataset::Dataset, data) write_object_header_and_dataspace_message(cio, f, psz, dataspace) write_datatype_message(cio, datatype) for a in attributes - write_message(cio, f, a, wsession) + write_header_message(cio, f, a, wsession) end # Data storage layout if layout_class == LcCompact @@ -187,7 +187,7 @@ function write_dataset(dataset::Dataset, data) end dataset.header_chunk_info = (header_offset, position(cio)+20, position(cio)) # Add NIL message replacable by continuation message - jlwrite(cio, CONTINUATION_PLACEHOLDER) + write_continuation_placeholder(cio) jlwrite(io, end_checksum(cio)) elseif layout_class == LcChunked # this thing is a bit weird @@ -205,7 +205,7 @@ function write_dataset(dataset::Dataset, data) dataset.header_chunk_info = (header_offset, position(cio)+20, position(cio)) # Add NIL message replacable by continuation message - jlwrite(cio, CONTINUATION_PLACEHOLDER) + write_continuation_placeholder(cio) jlwrite(f.io, end_checksum(cio)) else @@ -216,7 +216,7 @@ function write_dataset(dataset::Dataset, data) dataset.header_chunk_info = (header_offset, position(cio)+20, position(cio)) # Add NIL message replacable by continuation message - jlwrite(io, CONTINUATION_PLACEHOLDER) + write_continuation_placeholder(cio) jlwrite(io, end_checksum(cio)) f.end_of_data = address + datasz @@ -292,17 +292,16 @@ end # Attributes message_size(msg::WrittenAttribute) = jlsizeof(HeaderMessage) + jlsizeof(msg) -function write_message(io,f::JLDFile, msg::WrittenAttribute, wsession=JLDWriteSession()) +function write_header_message(io,f::JLDFile, msg::WrittenAttribute, wsession=JLDWriteSession()) jlwrite(io, HeaderMessage(HmAttribute, jlsizeof(msg), 0)) write_attribute(io, f, msg, wsession) return nothing end # Links -message_size(msg::Pair{String, RelOffset}) = jlsizeof(HeaderMessage) + link_size(msg.first) -write_message(io, f, msg::Pair{String, RelOffset}, _=nothing) = - jlwrite(io, Hmessage(HmLinkMessage; link_name = msg.first, target = msg.second)) - +message_size(msg::Pair{String, RelOffset}) = jlsizeof(Val(HmLinkMessage); link_name=msg.first) +write_header_message(io, f, msg::Pair{String, RelOffset}, _=nothing) = + write_header_message(io, Val(HmLinkMessage); link_name=msg.first, target=msg.second) function attach_message(f::JLDFile, offset, messages, wsession=JLDWriteSession(); chunk_start, @@ -325,7 +324,7 @@ function attach_message(f::JLDFile, offset, messages, wsession=JLDWriteSession() sz = message_size(msg) if remaining_space ≥ sz + 4 || remaining_space == sz pos = position(io) - write_message(io, f, msg) + write_header_message(io, f, msg) rsz = position(io) - pos if rsz != sz throw(InternalError("Message size mismatch. Expected $sz, got $rsz for message $msg")) @@ -345,10 +344,10 @@ function attach_message(f::JLDFile, offset, messages, wsession=JLDWriteSession() empty_space = chunk_end-position(io)-4 - 20 if empty_space != -4 empty_space < 0 && throw(InternalError("Negative empty space. 
This should not happen")) - write_message(io, f, Hmessage(HmNil, 0, empty_space)) + write_header_message(io, Val(HmNil), 0, empty_space) end # continuation space - write_message(io, f, Hmessage(HmNil, 0, 16)) + write_continuation_placeholder(io) # Re-calculate checksum update_checksum(io, chunk_start, chunk_end) @@ -357,7 +356,7 @@ function attach_message(f::JLDFile, offset, messages, wsession=JLDWriteSession() end if !iszero(remaining_space) # Mark remaining free space with a NIL message - write_message(io, f, Hmessage(HmNil, 0, remaining_space-4)) + write_header_message(io, Val(HmNil), 0, remaining_space-4) end # If we got to here then a new continuation needs to be created continuation_start = f.end_of_data @@ -369,9 +368,9 @@ function attach_message(f::JLDFile, offset, messages, wsession=JLDWriteSession() tmp - continuation_size > 4 && (continuation_size = tmp) # Object continuation message - jlwrite(io, Hmessage(HmObjectHeaderContinuation; + write_header_message(io, Val(HmObjectHeaderContinuation); continuation_offset=h5offset(f, continuation_start), - continuation_length=Length(continuation_size))) + continuation_length=Length(continuation_size)) # Re-calculate checksum update_checksum(io, chunk_start, chunk_end) @@ -386,16 +385,16 @@ function attach_message(f::JLDFile, offset, messages, wsession=JLDWriteSession() while !isempty(messages) msg = popfirst!(messages) sz = message_size(msg) - write_message(io, f, msg, wsession) + write_header_message(io, f, msg, wsession) next_msg_offset += sz remaining_space -= sz end if remaining_space > 0 @assert remaining_space ≥ 4 "Gaps smaller than 4 bytes should not occur" - jlwrite(cio, Hmessage(HmNil, 0, remaining_space)) + write_header_message(cio, Val(HmNil), 0, remaining_space) end # Extra space for object continuation - jlwrite(cio, CONTINUATION_PLACEHOLDER) + write_continuation_placeholder(cio) # Checksum jlwrite(io, end_checksum(cio)) f.end_of_data = position(io) @@ -518,7 +517,7 @@ function allocate_early(dset::Dataset, T::DataType) write_object_header_and_dataspace_message(cio, f, psz, dataspace) write_datatype_message(cio, datatype) for a in attributes - write_message(cio, f, a, wsession) + write_header_message(cio, f, a, wsession) end # Align contiguous chunk to 8 bytes in the file address = f.end_of_data + 8 - mod1(f.end_of_data, 8) @@ -527,7 +526,7 @@ function allocate_early(dset::Dataset, T::DataType) dset.header_chunk_info = (header_offset, position(cio)+20, position(cio)) # Add NIL message replacable by continuation message - jlwrite(io, CONTINUATION_PLACEHOLDER) + write_continuation_placeholder(cio) jlwrite(io, end_checksum(cio)) f.end_of_data = address + datasz diff --git a/src/groups.jl b/src/groups.jl index 72bad05e..11af3075 100644 --- a/src/groups.jl +++ b/src/groups.jl @@ -253,13 +253,6 @@ function load_group(f::JLDFile, offset::RelOffset) OrderedDict{String,RelOffset}(), OrderedDict{String,Group}(), links) end -""" - link_size(name::String) - -Returns the size of a link message, excluding message header. -""" -link_size(link_name::String) = sizefun(Val(HmLinkMessage), 0,0,(;link_name, target=UNDEFINED_ADDRESS)) - """ links_size(pairs) @@ -269,7 +262,7 @@ Returns the size of several link messages. 
`pairs` is an iterator of function links_size(pairs) sz = 0 for (name::String,) in pairs - sz += link_size(name) + jlsizeof(HeaderMessage) + sz += jlsizeof(Val(HmLinkMessage); link_name=name) end sz end @@ -305,30 +298,25 @@ function save_group(g::Group) # If the group has not been saved yet if g.last_chunk_start_offset == -1 - link_info = Hmessage(HmLinkInfo) - group_info = Hmessage(HmGroupInfo; g.est_num_entries, g.est_link_name_len) - - totalsize = jlsizeof(link_info) + jlsizeof(group_info) - # Object header continuation placeholder - totalsize += (jlsizeof(HeaderMessage) + jlsizeof(RelOffset) + jlsizeof(Length)) - # Link messages + totalsize = jlsizeof(Val(HmLinkInfo)) + totalsize += jlsizeof(Val(HmGroupInfo); g.est_num_entries, g.est_link_name_len) + totalsize += CONTINUATION_MSG_SIZE totalsize += links_size(g.unwritten_links) - # add to size to make space for additional links totalsize += group_extra_space(g) + sz = jlsizeof(ObjectStart) + size_size(totalsize) + totalsize g.last_chunk_start_offset = f.end_of_data g.last_chunk_checksum_offset = f.end_of_data + sz f.end_of_data += sz + 4 - seek(io, g.last_chunk_start_offset) # Object header jlwrite(io, ObjectStart(size_flag(totalsize))) write_size(io, totalsize) - jlwrite(io, link_info) - jlwrite(io, group_info) + write_header_message(io, Val(HmLinkInfo)) + write_header_message(io, Val(HmGroupInfo); g.est_num_entries, g.est_link_name_len) g.next_link_offset = position(io) end diff --git a/src/headermessages.jl b/src/headermessages.jl index 2109856b..8cc5ce09 100644 --- a/src/headermessages.jl +++ b/src/headermessages.jl @@ -6,7 +6,7 @@ @pseudostruct HmDataspace begin version::UInt8 = 2 dimensionality::UInt8 = length(kw.dimensions) - flags::UInt8 + flags::UInt8 = 0 (version == 2) && dataspace_type::UInt8 (version == 1) && dataspace_type::@computed(DS_V1) version == 1 && @skip(5) @@ -26,8 +26,8 @@ end @pseudostruct HmDatatype begin if isset(hflags,1) - version::UInt8 - msgtype::UInt8 + version::UInt8 = 3 + msgtype::UInt8 = 2 dt::SharedDatatype datatype_offset::@computed(dt.header_offset) end @@ -43,7 +43,21 @@ end end @pseudostruct HmFillValue begin - @skip(hsize) + version::UInt8 = 3 + if version == 1 || version == 2 + space_allocation_time::UInt8 + fill_value_write_time::UInt8 + fill_value_defined::UInt8 + if !(version > 1 && fill_value_defined==0) + size::UInt32 + fill_value::@Blob(size) + end + end + if version == 3 + flags::UInt8 + isset(flags, 5) && size::UInt32 + isset(flags, 5) && fill_value::@Blob(size) + end end @pseudostruct HmLinkMessage begin @@ -54,7 +68,7 @@ end isset(flags, 4) && (link_name_charset::UInt8 = CSET_UTF8) link_name_len::@Int(2^(flags%4)) = sizeof(kw.link_name) link_name::@FixedLengthString(link_name_len) # non-null-terminated - (!isset(flags, 3) || link_type==0) && target::RelOffset + (!isset(flags, 3) || link_type==0) && (target::RelOffset = UNDEFINED_ADDRESS) if isset(flags, 3) && link_type == 1 link_info_size::UInt16 soft_link::@Blob(link_info_size) # non-null terminated string @@ -75,7 +89,7 @@ end end @pseudostruct HmDataLayout begin - version::UInt8 + version::UInt8 = 4 if version in (1,2) dimensionality::UInt8 layout_class::LayoutClass @@ -108,8 +122,8 @@ end end if version == 4 && layout_class == LcChunked flags::UInt8 - dimensionality::UInt8 - dim_size::UInt8 + dimensionality::UInt8 = length(kw.dimensions) + dim_size::UInt8 = 8 # 8 bytes per dimension dimensions::NTuple{Int(dimensionality), uintofsize(dim_size)} chunk_indexing_type::UInt8 if chunk_indexing_type == 1 # Single Chunk diff --git 
a/src/macros_utils.jl b/src/macros_utils.jl index 0a9d7345..526a4e2f 100644 --- a/src/macros_utils.jl +++ b/src/macros_utils.jl @@ -50,15 +50,13 @@ macro pseudostruct(name, blck) constructor_body, size_body, messageshow_body = build_fun_body((Any[], Any[], Any[]), blck) - exprs = generate_getprop(blck.args) + get_prop_exprs = generate_getprop(blck.args) quote - function $(esc(:construct_hm_payload))(::Val{$name}, $(esc(:hflags)), $(esc(:hsize)), $(esc(:kw))) - io = IOBuffer() + function $(esc(:jlwrite))(io, ::Val{$name}, $(esc(:hflags)), $(esc(:hsize)), $(esc(:kw))) $(constructor_body...) - io end - function $(esc(:sizefun))(::Val{$name}, $(esc(:hflags)), $(esc(:hsize)), $(esc(:kw))) + function $(esc(:compute_size))(::Val{$name}, $(esc(:hflags)), $(esc(:hsize)), $(esc(:kw))) $(esc(:offset)) = 0 $(size_body...) return $(esc(:offset)) @@ -73,19 +71,21 @@ macro pseudostruct(name, blck) return keyvalue end - function $(esc(:ioexpr))(::Val{$name}) - return $(QuoteNode(exprs)) + function $(esc(:(Base.getproperty)))(tw::HmWrap{$name}, s::Symbol) + s in (:size, :hflags, :m) && return getfield(tw, s) + m = getfield(tw, :m) + hflags = getfield(tw, :hflags) + hsize = getfield(tw, :size) + io = getfield(m, :io) + $(get_prop_exprs) + throw(ArgumentError("property $s not found")) end nothing end end -function getprop end -function construct_hm_payload end -function sizefun end +function compute_size end function messageshow end -function ioexpr end - function build_fun_body(accs, blk) for ex in blk.args @@ -152,8 +152,8 @@ function linefun(ex) haskey_ = nothing elseif @capture(T, @read(type_, rsize_)) || @capture(T, @read(type_)) read_io = :(jlread($io, $(esc(type)))) - write_statement = :(jlwrite(_io, $(esc(s)))) - increment = isnothing(rsize) ? :(sizeof(typeof($(esc(s))))) : rsize + write_statement = :(jlwrite(io, $(esc(s)))) + increment = isnothing(rsize) ? :(jlsizeof($(esc(s)))) : rsize else T = esc(T) read_io = :(jlread($io, $T)) diff --git a/src/object_headers.jl b/src/object_headers.jl index 8f336c48..e4d07e2b 100644 --- a/src/object_headers.jl +++ b/src/object_headers.jl @@ -22,14 +22,20 @@ struct Hmessage{IO} m::Message{IO} end -function Hmessage(type::HeaderMessageType, hflags=0x00, size=0; kwargs...) +function write_header_message(io, vtype::Val{HMT}, hflags=0x00, size=0; kwargs...) where HMT kw = (; kwargs...) - size = sizefun(Val(type), hflags, size, kw) - payload = construct_hm_payload(Val(type), hflags, size, kw) - Hmessage(type, UInt16(size), UInt8(hflags), UNDEFINED_ADDRESS,UNDEFINED_ADDRESS, - Message(type, 0, UNDEFINED_ADDRESS, payload)) + size = compute_size(vtype, hflags, size, kw) + jlwrite(io, UInt8(HMT)) + jlwrite(io, UInt16(size)) + jlwrite(io, UInt8(hflags)) + jlwrite(io, vtype, hflags, size, kw) + nothing end +# Returns size of the actual message + 4 bytes for the type, size, and flags +jlsizeof(vtype::Val, hflags=0x00, size=0; kwargs...) 
= + compute_size(vtype, hflags, size, (; kwargs...)) + 4 + struct HmWrap{HM, IOT} m::Message{IOT} hflags::UInt8 @@ -40,35 +46,7 @@ struct HmWrap{HM, IOT} new{type,IOT}(m, 0x0, 0x0) end -for HM in instances(HeaderMessageType) - @eval function Base.getproperty(tw::HmWrap{$HM}, s::Symbol) - s in (:size, :hflags, :m) && return getfield(tw, s) - m = getfield(tw, :m) - hflags = getfield(tw, :hflags) - hsize = getfield(tw, :size) - io = getfield(m, :io) - $(ioexpr(Val(HM))) - throw(ArgumentError("property $s not found")) - end -end - -const CONTINUATION_PLACEHOLDER = Hmessage(HmNil, 0, 16) - -write_message(io, f::JLDFile, msg::Hmessage) = jlwrite(io, msg) - -function jlwrite(io, msg::Hmessage) - write(io, msg.type) - write(io, msg.size) - write(io, msg.hflags) - m = msg.m - mio = m.io - seek(mio, m.address) - for _ in 1:msg.size - write(io, jlread(mio, UInt8)) - end -end - -jlsizeof(msg::Hmessage) = jlsizeof(HeaderMessage) + msg.size +write_continuation_placeholder(io::IO) = write_header_message(io, Val(HmNil), 0, 16) function Base.show(io::IO, hm::Hmessage) println(io, @@ -131,7 +109,6 @@ function print_header_messages(f::JLDFile, offset::RelOffset) nothing end - function read_header_message(f, io, header_version, chunk_start, groupflags) msgpos = h5offset(f, position(io)) if header_version == 1 @@ -153,8 +130,6 @@ function read_header_message(f, io, header_version, chunk_start, groupflags) Message(msg.msg_type, payload_address, payload_offset, io)) end - - """ mutable struct HeaderMessageIterator{IO} HeaderMessageIterator(f::JLDFile, offset::RelOffset) diff --git a/src/types.jl b/src/types.jl index dfc8284a..28e5fb3a 100644 --- a/src/types.jl +++ b/src/types.jl @@ -13,6 +13,7 @@ const OBJECT_HEADER_CONTINUATION_SIGNATURE = htol(0x4b48434f) # "OCHK" LcChunked = 0x02 LcVirtual = 0x03 end +LayoutClass(lc::LayoutClass) = lc @enum(CharacterSet::UInt8, CSET_ASCII, @@ -163,6 +164,9 @@ struct CommittedDatatype <: H5Datatype index::Int end +# Allow dropping the index field +SharedDatatype(dt::CommittedDatatype) = SharedDatatype(dt.header_offset) + """ ReadRepresentation{T,ODR} From 847e226136e4f8eceeeade29e4a5089aab56d795 Mon Sep 17 00:00:00 2001 From: Jonas Isensee Date: Sun, 25 Aug 2024 13:49:11 +0200 Subject: [PATCH 07/11] get rid of StorageMessage structs --- src/datalayouts.jl | 28 ---------------------------- src/datasets.jl | 9 +++++---- src/explicit_datasets.jl | 19 ++++++++++--------- src/headermessages.jl | 6 +++--- 4 files changed, 18 insertions(+), 44 deletions(-) diff --git a/src/datalayouts.jl b/src/datalayouts.jl index 16964883..2f101ee5 100644 --- a/src/datalayouts.jl +++ b/src/datalayouts.jl @@ -1,31 +1,3 @@ -struct CompactStorageMessage - hm::HeaderMessage - version::UInt8 - layout_class::LayoutClass - data_size::UInt16 -end -define_packed(CompactStorageMessage) -CompactStorageMessage(datasz::Int) = - CompactStorageMessage( - HeaderMessage(HmDataLayout, jlsizeof(CompactStorageMessage) - jlsizeof(HeaderMessage) + datasz, 0), - 4, LcCompact, datasz - ) - -struct ContiguousStorageMessage - hm::HeaderMessage - version::UInt8 - layout_class::LayoutClass - address::RelOffset - data_size::Length -end -define_packed(ContiguousStorageMessage) -ContiguousStorageMessage(datasz::Int, offset::RelOffset) = - ContiguousStorageMessage( - HeaderMessage(HmDataLayout, jlsizeof(ContiguousStorageMessage) - jlsizeof(HeaderMessage), 0), - 4, LcContiguous, offset, datasz - ) - - ## Left over header message parsing that does not have a good place. 
struct DataLayout diff --git a/src/datasets.jl b/src/datasets.jl index 31d19df3..45c2acc9 100644 --- a/src/datasets.jl +++ b/src/datasets.jl @@ -365,7 +365,7 @@ end # Figure out the layout if datasz == 0 || (!(data isa Array) && datasz < 8192) layout_class = LcCompact - psz += jlsizeof(CompactStorageMessage) + datasz + psz += jlsizeof(Val(HmDataLayout); layout_class, data_size=datasz) elseif data isa Array && compress != false && isconcretetype(eltype(data)) && isbitstype(eltype(data)) # Only now figure out if the compression argument is valid invoke_again, filter_id, compressor = get_compressor(compress) @@ -376,7 +376,7 @@ end psz += chunked_storage_message_size(ndims(data)) + pipeline_message_size(filter_id::UInt16) else layout_class = LcContiguous - psz += jlsizeof(ContiguousStorageMessage) + psz += jlsizeof(Val(HmDataLayout); layout_class) end fullsz = jlsizeof(ObjectStart) + size_size(psz) + psz + 4 @@ -396,7 +396,7 @@ end # Data storage layout if layout_class == LcCompact - jlwrite(cio, CompactStorageMessage(datasz)) + write_header_message(cio, Val(HmDataLayout); layout_class, data_size=datasz) if datasz != 0 write_data(cio, f, data, odr, datamode(odr), wsession) end @@ -420,7 +420,8 @@ end jlwrite(f.io, deflated) else data_address = f.end_of_data + 8 - mod1(f.end_of_data, 8) - jlwrite(cio, ContiguousStorageMessage(datasz, h5offset(f, data_address))) + write_header_message(cio, Val(HmDataLayout); + layout_class, data_address=h5offset(f, data_address), data_size=datasz) write_continuation_placeholder(cio) jlwrite(io, end_checksum(cio)) diff --git a/src/explicit_datasets.jl b/src/explicit_datasets.jl index 2f5d39a7..4c502130 100644 --- a/src/explicit_datasets.jl +++ b/src/explicit_datasets.jl @@ -151,8 +151,7 @@ function write_dataset(dataset::Dataset, data) # DataLayout object is only available after the data is written if datasz == 0 || (!(data isa Array) && datasz < 8192) layout_class = LcCompact - psz += jlsizeof(CompactStorageMessage) + datasz - + psz += jlsizeof(Val(HmDataLayout); layout_class, data_size=datasz) elseif !isnothing(dataset.chunk) || !isempty(dataset.filters.filters) # Do some additional checks on the data here layout_class = LcChunked @@ -160,7 +159,7 @@ function write_dataset(dataset::Dataset, data) psz += chunked_storage_message_size(ndims(data)) + pipeline_message_size(filter_id::UInt16) else layout_class = LcContiguous - psz += jlsizeof(ContiguousStorageMessage) + psz += jlsizeof(Val(HmDataLayout); layout_class) end fullsz = jlsizeof(ObjectStart) + size_size(psz) + psz + 4 @@ -181,7 +180,7 @@ function write_dataset(dataset::Dataset, data) end # Data storage layout if layout_class == LcCompact - jlwrite(cio, CompactStorageMessage(datasz)) + write_header_message(cio, Val(HmDataLayout); layout_class, data_size=datasz) if datasz != 0 write_data(cio, f, data, odr, datamode(odr), wsession) end @@ -211,8 +210,9 @@ function write_dataset(dataset::Dataset, data) else # Align contiguous chunk to 8 bytes in the file address = f.end_of_data + 8 - mod1(f.end_of_data, 8) - offset = h5offset(f, address) - jlwrite(cio, ContiguousStorageMessage(datasz, offset)) + data_address = h5offset(f, address) + write_header_message(cio, Val(HmDataLayout); + layout_class, data_address, data_size=datasz) dataset.header_chunk_info = (header_offset, position(cio)+20, position(cio)) # Add NIL message replacable by continuation message @@ -505,7 +505,7 @@ function allocate_early(dset::Dataset, T::DataType) # Layout class: Use contiguous for now layout_class = LcContiguous - psz += 
jlsizeof(ContiguousStorageMessage) + psz += jlsizeof(Val(HmDataLayout); layout_class) fullsz = jlsizeof(ObjectStart) + size_size(psz) + psz + 4 header_offset = f.end_of_data @@ -521,8 +521,9 @@ function allocate_early(dset::Dataset, T::DataType) end # Align contiguous chunk to 8 bytes in the file address = f.end_of_data + 8 - mod1(f.end_of_data, 8) - offset = h5offset(f, address) - jlwrite(cio, ContiguousStorageMessage(datasz, offset)) + data_address = h5offset(f, address) + write_header_message(cio, Val(HmDataLayout); + layout_class, data_address, data_size=datasz) dset.header_chunk_info = (header_offset, position(cio)+20, position(cio)) # Add NIL message replacable by continuation message diff --git a/src/headermessages.jl b/src/headermessages.jl index 8cc5ce09..f2270a9f 100644 --- a/src/headermessages.jl +++ b/src/headermessages.jl @@ -108,11 +108,11 @@ end if layout_class == LcCompact data_size::UInt16 data_address::@Offset - data::@Blob(data_size) + data::@Blob(data_size) = UInt8[] # don't write anything if nothing is passed end if layout_class == LcContiguous - data_address::RelOffset - data_size::Int64# Lengths + data_address::RelOffset = UNDEFINED_ADDRESS + data_size::Int64 = 0# Lengths end if version == 3 && layout_class == LcChunked dimensionality::UInt8 From 4778a0018483d2e77460e8d28ef4ccc28efb81b8 Mon Sep 17 00:00:00 2001 From: Jonas Isensee Date: Sun, 25 Aug 2024 14:18:22 +0200 Subject: [PATCH 08/11] chunked storage fix --- src/committed_datatype_introspection.jl | 1 - src/compression.jl | 22 +++++++++------------- src/datasets.jl | 9 +-------- src/explicit_datasets.jl | 18 +++++++++--------- test/dataset_api.jl | 8 +++++++- 5 files changed, 26 insertions(+), 32 deletions(-) diff --git a/src/committed_datatype_introspection.jl b/src/committed_datatype_introspection.jl index 92ef73ee..b3d6ea96 100644 --- a/src/committed_datatype_introspection.jl +++ b/src/committed_datatype_introspection.jl @@ -56,7 +56,6 @@ function stringify_object(f, offset) dataspace = ReadDataspace() attrs = EMPTY_READ_ATTRIBUTES datatype::H5Datatype = PlaceholderH5Datatype() - chunked_storage::Bool = false layout::DataLayout = DataLayout(0,LcCompact,0,-1) filter_pipeline::FilterPipeline = FilterPipeline(Filter[]) for msg in HeaderMessageIterator(f, offset) diff --git a/src/compression.jl b/src/compression.jl index d3f93a1f..211e0514 100644 --- a/src/compression.jl +++ b/src/compression.jl @@ -106,6 +106,15 @@ function get_compressor(::Bool) false, COMPRESSOR_TO_ID[:ZlibCompressor], m.ZlibCompressor() end +function get_compressor(filter_id::UInt16) + modname, compressorname, decompressorname, = ID_TO_DECOMPRESSOR[filter_id] + invoke_again, m = checked_import(modname) + if invoke_again || !applicable(getproperty(m,compressorname)) + _, compressor = Base.invokelatest(get_compressor, filter_id) + return true, compressor + end + return invoke_again, getproperty(m,compressorname)() +end function get_decompressor(filter_id::UInt16) modname, compressorname, decompressorname, = ID_TO_DECOMPRESSOR[filter_id] invoke_again, m = checked_import(modname) @@ -191,19 +200,6 @@ function write_chunked_storage_message( io::IO, data_address) end -function write_compressed_data(cio, f, data, odr, wsession, filter_id, compressor) - write_filter_pipeline_message(cio, filter_id) - - # deflate first - deflated = deflate_data(f, data, odr, wsession, compressor) - - write_chunked_storage_message(cio, odr_sizeof(odr), size(data), length(deflated), h5offset(f, f.end_of_data)) - jlwrite(f.io, end_checksum(cio)) - - f.end_of_data += 
length(deflated) - jlwrite(f.io, deflated) -end - function decompress!(inptr::Ptr, data_length, element_size, n, decompressor::TranscodingStreams.Codec) TranscodingStreams.initialize(decompressor) data = transcode(decompressor, unsafe_wrap(Array, Ptr{UInt8}(inptr), data_length))::Array{UInt8, 1} diff --git a/src/datasets.jl b/src/datasets.jl index 45c2acc9..69e9f379 100644 --- a/src/datasets.jl +++ b/src/datasets.jl @@ -403,15 +403,11 @@ end write_continuation_placeholder(cio) jlwrite(io, end_checksum(cio)) elseif layout_class == LcChunked - write_filter_pipeline_message(cio, filter_id) - # deflate first deflated = deflate_data(f, data, odr, wsession, compressor) - write_chunked_storage_message(cio, odr_sizeof(odr), size(data), length(deflated), h5offset(f, f.end_of_data)) - - # Add NIL message replacable by continuation message + write_chunked_storage_message(cio, odr_sizeof(odr), size(data), length(deflated), h5offset(f, f.end_of_data)) write_continuation_placeholder(cio) jlwrite(f.io, end_checksum(cio)) @@ -436,11 +432,8 @@ end function write_object_header_and_dataspace_message(cio::IO, f::JLDFile, psz::Int, dataspace::WriteDataspace) jlwrite(cio, ObjectStart(size_flag(psz))) write_size(cio, psz) - write_header_message(cio, Val(HmFillValue); flags=0x09) write_header_message(cio, Val(HmDataspace); dataspace.dataspace_type, dimensions=dataspace.size) - - # Attributes for attr in dataspace.attributes write_header_message(cio, f, attr) end diff --git a/src/explicit_datasets.jl b/src/explicit_datasets.jl index 4c502130..02c82409 100644 --- a/src/explicit_datasets.jl +++ b/src/explicit_datasets.jl @@ -153,6 +153,11 @@ function write_dataset(dataset::Dataset, data) layout_class = LcCompact psz += jlsizeof(Val(HmDataLayout); layout_class, data_size=datasz) elseif !isnothing(dataset.chunk) || !isempty(dataset.filters.filters) + filter_id = dataset.filters.filters[1].id + invoke_again, compressor = get_compressor(filter_id) + if invoke_again + return Base.invokelatest(write_dataset, dset, data)::RelOffset + end # Do some additional checks on the data here layout_class = LcChunked # improve filter support here @@ -189,24 +194,19 @@ function write_dataset(dataset::Dataset, data) write_continuation_placeholder(cio) jlwrite(io, end_checksum(cio)) elseif layout_class == LcChunked - # this thing is a bit weird - write_compressed_data(cio, f, data, odr, wsession, filter_id, compressor) write_filter_pipeline_message(cio, filter_id) # deflate first deflated = deflate_data(f, data, odr, wsession, compressor) - seek(f.io, h5offset(f, f.end_of_data)) - f.end_of_data += length(deflated) - jlwrite(f.io, deflated) - write_chunked_storage_message(cio, odr_sizeof(odr), size(data), length(deflated), h5offset(f, f.end_of_data)) - dataset.header_chunk_info = (header_offset, position(cio)+20, position(cio)) - # Add NIL message replacable by continuation message write_continuation_placeholder(cio) jlwrite(f.io, end_checksum(cio)) - + + seek(f.io, f.end_of_data) + f.end_of_data += length(deflated) + jlwrite(f.io, deflated) else # Align contiguous chunk to 8 bytes in the file address = f.end_of_data + 8 - mod1(f.end_of_data, 8) diff --git a/test/dataset_api.jl b/test/dataset_api.jl index 836c9819..b82a9b65 100644 --- a/test/dataset_api.jl +++ b/test/dataset_api.jl @@ -23,6 +23,12 @@ using JLD2, Test # Check that double attributes are not allowed @test_throws ArgumentError JLD2.add_attribute(dset, "addition", "A very different description.") end - end + jldopen(fn, "w") do f + dset = JLD2.create_dataset(f, "d") + 
dset.filters = JLD2.FilterPipeline([JLD2.Filter(1, 0, "", [])]) + JLD2.write_dataset(dset, zeros(1000,1000)) + end + @test load(fn)["d"] == zeros(1000,1000) + end end \ No newline at end of file From 9beaede96febdc09aeae39459ccb11cb4c35315c Mon Sep 17 00:00:00 2001 From: Jonas Isensee Date: Sun, 25 Aug 2024 15:45:57 +0200 Subject: [PATCH 09/11] clean up filterpipeline --- src/datalayouts.jl | 56 +++++++++++++--------------------------------- 1 file changed, 15 insertions(+), 41 deletions(-) diff --git a/src/datalayouts.jl b/src/datalayouts.jl index 2f101ee5..50db3fc6 100644 --- a/src/datalayouts.jl +++ b/src/datalayouts.jl @@ -59,47 +59,21 @@ function FilterPipeline(msg_::Hmessage) nfilters = msg.nfilters io = msg.m.io seek(io, msg.m.address+2) - if version == 1 - skip(io, 6) - filters = map(1:nfilters) do _ - id = jlread(io, UInt16) - name_length = jlread(io, UInt16) - flags = jlread(io, UInt16) - nclient_vals = jlread(io, UInt16) - if iszero(name_length) - name = "" - else - name = read_bytestring(io) - skip(io, 8-mod1(sizeof(name), 8)-1) - end - client_data = jlread(io, UInt32, nclient_vals) - isodd(nclient_vals) && skip(io, 4) - Filter(id, flags, name, client_data) - end - return FilterPipeline(filters) - elseif version == 2 - filters = map(1:nfilters) do _ - id = jlread(io, UInt16) - if id > 255 - name_length = jlread(io, UInt16) - flags = jlread(io, UInt16) - nclient_vals = jlread(io, UInt16) - if iszero(name_length) - name = "" - else - name = read_bytestring(io) - skip(io, 8-mod1(sizeof(name), 8)-1) - end - else - name = "" - flags = jlread(io, UInt16) - nclient_vals = jlread(io, UInt16) - end - client_data = jlread(io, UInt32, nclient_vals) - Filter(id, flags, name, client_data) + version == 1 && skip(io, 6) + filters = map(1:nfilters) do _ + id = jlread(io, UInt16) + name_length = (version == 2 && id < 255) ? 
zero(UInt16) : jlread(io, UInt16) + flags = jlread(io, UInt16) + nclient_vals = jlread(io, UInt16) + if iszero(name_length) + name = "" + else + name = read_bytestring(io) + skip(io, 8-mod1(sizeof(name), 8)-1) end - return FilterPipeline(filters) - else - throw(UnsupportedVersionException("Filter Pipeline Message version $version is not implemented")) + client_data = jlread(io, UInt32, nclient_vals) + (version == 1 && isodd(nclient_vals)) && skip(io, 4) + Filter(id, flags, name, client_data) end + return FilterPipeline(filters) end \ No newline at end of file From 3c8884fad2b7681908dfe0242a392edffcc4be17 Mon Sep 17 00:00:00 2001 From: JonasIsensee Date: Sun, 25 Aug 2024 16:43:08 +0200 Subject: [PATCH 10/11] Draft: experimental plain reconstruction (#522) * experimental plain reconstruction * Upgrade * add test --- src/JLD2.jl | 12 ++++++++---- src/data/reconstructing_datatypes.jl | 8 ++++++++ test/test_files.jl | 12 ++++++++++++ 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/JLD2.jl b/src/JLD2.jl index b0a19a27..b76b1780 100644 --- a/src/JLD2.jl +++ b/src/JLD2.jl @@ -105,6 +105,7 @@ mutable struct JLDFile{T<:IO} path::String writable::Bool written::Bool + plain::Bool compress#::Union{Bool,Symbol} mmaparrays::Bool n_times_opened::Int @@ -125,11 +126,13 @@ mutable struct JLDFile{T<:IO} root_group::Group{JLDFile{T}} types_group::Group{JLDFile{T}} base_address::UInt64 + function JLDFile{T}(io::IO, path::AbstractString, writable::Bool, written::Bool, + plain::Bool, compress,#::Union{Bool,Symbol}, mmaparrays::Bool) where T - f = new(io, path, writable, written, compress, mmaparrays, 1, false, + f = new(io, path, writable, written, plain, compress, mmaparrays, 1, false, OrderedDict{RelOffset,CommittedDatatype}(), H5Datatype[], JLDWriteSession(), Dict{String,Any}(), IdDict(), IdDict(), Dict{RelOffset,WeakRef}(), DATA_START, Dict{RelOffset,GlobalHeap}(), @@ -138,8 +141,8 @@ mutable struct JLDFile{T<:IO} f end end -JLDFile(io::IO, path::AbstractString, writable::Bool, written::Bool, compress, mmaparrays::Bool) = - JLDFile{typeof(io)}(io, path, writable, written, compress, mmaparrays) +JLDFile(io::IO, path::AbstractString, writable::Bool, written::Bool, plain::Bool, compress, mmaparrays::Bool) = + JLDFile{typeof(io)}(io, path, writable, written, plain, compress, mmaparrays) """ fileoffset(f::JLDFile, x::RelOffset) @@ -189,6 +192,7 @@ function jldopen(fname::AbstractString, wr::Bool, create::Bool, truncate::Bool, mmaparrays::Bool=false, typemap::Dict{String}=Dict{String,Any}(), parallel_read::Bool=false, + plain::Bool=false ) where T<:Union{Type{IOStream},Type{MmapIO}} mmaparrays && @warn "mmaparrays keyword is currently ignored" maxlog=1 verify_compressor(compress) @@ -240,7 +244,7 @@ function jldopen(fname::AbstractString, wr::Bool, create::Bool, truncate::Bool, io = openfile(iotype, fname, wr, create, truncate, fallback) created = !exists || truncate rname = realpath(fname) - f = JLDFile(io, rname, wr, created, compress, mmaparrays) + f = JLDFile(io, rname, wr, created, plain, compress, mmaparrays) if !parallel_read OPEN_FILES[rname] = WeakRef(f) diff --git a/src/data/reconstructing_datatypes.jl b/src/data/reconstructing_datatypes.jl index 3eb75960..ca40bc7c 100644 --- a/src/data/reconstructing_datatypes.jl +++ b/src/data/reconstructing_datatypes.jl @@ -98,6 +98,11 @@ function jltype(f::JLDFile, cdt::CommittedDatatype) end datatype = read_attr_data(f, julia_type_attr) + if f.plain && !(datatype isa Upgrade) && !(datatype <: Tuple) + rr = jltype(f, dt) + return f.h5jltype[cdt] = 
rr + end + if written_type_attr !== nothing # Custom serialization custom_datatype = read_attr_data(f, written_type_attr) @@ -415,6 +420,9 @@ function jlconvert(rr::ReadRepresentation{T,DataTypeODR()}, isunknowntype(m) && return m unknown_params && return UnknownType{m, Tuple{params...}} if hasparams + if f.plain && !(m === Tuple) + return Any + end try m = m{params...} catch e diff --git a/test/test_files.jl b/test/test_files.jl index e4185ebd..dd0802e7 100644 --- a/test/test_files.jl +++ b/test/test_files.jl @@ -257,4 +257,16 @@ end @test getfoo("readas_foo_n_sin.jld2") isa Readas.FooN{typeof(sin)} @test getfoo("readas_foo_a.jld2") isa Readas.Foo{Readas.UndefinedFunction} @test getfoo("readas_foo_n_a.jld2") isa Readas.FooNSerialization +end + + +@testset "plain reconstruction" begin + fn = joinpath(testfiles,"struct_reconstruction.jld2") + data = load(fn; plain=true) + # This is somewhat broken: Tuples are committed with field names "1", "2",... + # these are valid names but break most of the common API incl. the @NamedTuple macro + #@test data["tms"] == @NamedTuple{1::Int64, 2}((1, (a = 1,))) + @test getproperty(data["tms"], Symbol(1)) == 1 + @test data["s"] == (a = 1,) + @test data["ds"].kvvec[1] == (; first = "a", second = (a = 1,)) end \ No newline at end of file From 5ec40d2ec0d39e235a5a13fabc848926393538e9 Mon Sep 17 00:00:00 2001 From: Jonas Isensee Date: Mon, 26 Aug 2024 09:08:41 +0200 Subject: [PATCH 11/11] docstrings changelog & version bump --- CHANGELOG.md | 8 +++++++ Project.toml | 2 +- src/explicit_datasets.jl | 50 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 58 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 52e7e877..8de7a966 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## 0.4.52 + - fix attribute loading + - new features: `readmmap` `ismmappable` and `allocate_early` (api experimental) + - adds Downgrade testing + - new feature: disable committing datatypes. (restrict to h5 numbers, strings, and arrays) + - internal cleanup + - new experimental feature: reconstruct all committed types as `NamedTuple`s + ## 0.4.51 - remove Unicode normalization support due to excessive performance loss - rework of header message internals diff --git a/Project.toml b/Project.toml index 3418b006..62709231 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "JLD2" uuid = "033835bb-8acc-5ee8-8aae-3f567f8a3819" -version = "0.4.51" +version = "0.4.52" [deps] FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" diff --git a/src/explicit_datasets.jl b/src/explicit_datasets.jl index 02c82409..f4d782ab 100644 --- a/src/explicit_datasets.jl +++ b/src/explicit_datasets.jl @@ -113,6 +113,11 @@ function Base.show(io::IO, ::MIME"text/plain", dset::Dataset) println(io, "└─") end +""" + write_dataset(dataset::Dataset, data) + +Write data to file using metadata prepared in the `dataset`. +""" function write_dataset(dataset::Dataset, data) f = dataset.parent.f if dataset.offset != UNDEFINED_ADDRESS @@ -229,6 +234,11 @@ function write_dataset(dataset::Dataset, data) return offset end +""" + read_dataset(dset::Dataset) + +Read the data referenced by a dataset. +""" function read_dataset(dset::Dataset) f = dset.parent.f read_data(f, @@ -240,6 +250,12 @@ function read_dataset(dset::Dataset) collect(values(dset.attributes))) end +""" + get_dataset(parent::Union{JLDFile, Group}, name::String) + +Get a stored dataset from a file by name or path as a `Dataset` object. +This may be useful for inspecting the metadata incl. types of a dataset. 
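+
+A minimal usage sketch (the file name and dataset path below are illustrative only):
+
+    f = jldopen("example.jld2", "r")
+    dset = get_dataset(f, "mydata")
+    read_dataset(dset)   # load the values described by this dataset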
+""" get_dataset(f::JLDFile, args...; kwargs...) = get_dataset(f.root_group, args...; kwargs...) @@ -427,6 +443,12 @@ function add_attribute(dset::Dataset, name::String, data, wsession=JLDWriteSessi end end +""" + attributes(dset::Dataset; plain::Bool=false) + +Return the attributes of a dataset as an `OrderedDict`. +If `plain` is set to `true` then the values are returned as stored in the dataset object. +""" function attributes(dset::Dataset; plain::Bool=false) plain && return dset.attributes OrderedDict(keys(dset.attributes) .=> map(values(dset.attributes)) do attr @@ -434,7 +456,18 @@ function attributes(dset::Dataset; plain::Bool=false) end) end -## Mmap Arrays +""" + ismmappable(dset::Dataset) + +Check if a dataset can be memory-mapped. This can be useful for large arrays and for editing written arrays. + +An Array dataset may be mmapped if: + - `JLD2.samelayout(T) == true`: The element type is `isbits` and has a size that is a multiple of 8 bytes. + - Uncompressed: Compressed arrays cannot be memory-mapped + - Uses a contiguous layout: This is true for all array datasets written by JLD2 with version ≥ v0.4.52 + - Offset in file is a multiple of 8 bytes: This is a requirement for Mmap. + - Windows: The file must be opened in read-only mode. This is a limitation of Mmap on Windows. +""" function ismmappable(dset::Dataset) iswritten(dset) || return false f = dset.parent.f @@ -458,6 +491,13 @@ function ismmappable(dset::Dataset) return ret end + +""" + readmmap(dset::Dataset) + +Memory-map a dataset. This can be useful for large arrays and for editing written arrays. +See [`ismmappable`](@ref) for requirements. +""" function readmmap(dset::Dataset) ismmappable(dset) || throw(ArgumentError("Dataset is not mmappable")) f = dset.parent.f @@ -481,6 +521,14 @@ function readmmap(dset::Dataset) end @static if !Sys.iswindows() +""" + allocate_early(dset::Dataset, T::DataType) + +Write a dataset to file without any actual data. Reserve space according to element type and dimensions. +This may be useful in conjunction with [`readmmap`](@ref). + +Note: Not available on Windows. +""" function allocate_early(dset::Dataset, T::DataType) iswritten(dset) && throw(ArgumentError("Dataset has already been written to file")) # for this to work, require all information to be provided