diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 700707ce..03e0bffb 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -5,3 +5,4 @@ updates:
     directory: "/" # Location of package manifests
     schedule:
       interval: "weekly"
+    target-branch: "dev"
diff --git a/.github/workflows/Docs.yml b/.github/workflows/Docs.yml
new file mode 100644
index 00000000..37fee9e8
--- /dev/null
+++ b/.github/workflows/Docs.yml
@@ -0,0 +1,21 @@
+name: Documenter
+on:
+  push:
+    branches: [main, master, dev]
+    tags: [v*]
+  pull_request:
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+jobs:
+  Documenter:
+    permissions:
+      contents: write
+      statuses: write
+    name: Documentation
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: julia-actions/julia-docdeploy@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
\ No newline at end of file
diff --git a/.github/workflows/Downgrade.yml b/.github/workflows/Downgrade.yml
index 8e66c1db..06dc278e 100644
--- a/.github/workflows/Downgrade.yml
+++ b/.github/workflows/Downgrade.yml
@@ -1,15 +1,16 @@
 name: Downgrade
 on:
   pull_request:
-    branches:
-      - master
+    branches: [main, master, dev]
     paths-ignore:
       - 'docs/**'
   push:
-    branches:
-      - master
+    branches: [main, master, dev]
     paths-ignore:
       - 'docs/**'
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
 jobs:
   test:
     runs-on: ubuntu-latest
@@ -18,11 +19,11 @@ jobs:
         version: ['1']
     steps:
       - uses: actions/checkout@v4
-      - uses: julia-actions/setup-julia@v1
+      - uses: julia-actions/setup-julia@v2
        with:
          version: ${{ matrix.version }}
-      - uses: cjdoris/julia-downgrade-compat-action@v1
+      - uses: julia-actions/julia-downgrade-compat@v1
        with:
-          skip: Pkg,TOML,Mmap
+          skip: Pkg, TOML, Mmap
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
diff --git a/.github/workflows/Invalidations.yml b/.github/workflows/Invalidations.yml
index 66c86a36..decf2561 100644
--- a/.github/workflows/Invalidations.yml
+++ b/.github/workflows/Invalidations.yml
@@ -2,6 +2,7 @@ name: Invalidations
 on:
   pull_request:
+    branches: [main, master, dev]
 
 concurrency:
   # Skip intermediate builds: always.
@@ -11,9 +12,6 @@ concurrency:
 
 jobs:
   evaluate:
-    # Only run on PRs to the default branch.
-    # In the PR trigger above branches can be specified only explicitly whereas this check should work for master, main, or any other default branch
-    if: github.base_ref == github.event.repository.default_branch
     runs-on: ubuntu-latest
     steps:
       - uses: julia-actions/setup-julia@v2
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d31f815b..7383441c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,14 +1,13 @@
 name: CI
 on:
   pull_request:
-    branches:
-      - master
-      - dev
+    branches: [main, master, dev]
   push:
-    branches:
-      - master
-      - dev
+    branches: [main, master, dev]
     tags: '*'
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
 jobs:
   test:
     name: Tests, Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
@@ -16,25 +15,14 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        version:
-          - '1.6'
-          - '1'
-          - 'nightly'
-        os: [ubuntu-latest, windows-latest, macos-latest] # adjust according to need, e.g. os: [ubuntu-latest] if testing only on linux
-        arch:
-          - x64
+        version: ['min', '1', 'nightly']
+        os: [ubuntu-latest, windows-latest, macos-latest]
+        arch: [x64]
         include:
           - os: ubuntu-latest
             version: '1'
             arch: x86
     steps:
-      # Cancel ongoing CI test runs if pushing to branch again before the previous tests
-      # have finished
-      - name: Cancel ongoing test runs for previous commits
-        uses: styfle/cancel-workflow-action@0.12.1
-        with:
-          access_token: ${{ github.token }}
-      # Do tests
       - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@v2
@@ -49,31 +37,3 @@ jobs:
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
           file: lcov.info
-  docs:
-    name: Documentation
-    runs-on: ubuntu-latest
-    steps:
-      # Cancel ongoing documentation build if pushing to branch again before the previous
-      # build is finished.
-      - name: Cancel ongoing documentation builds for previous commits
-        uses: styfle/cancel-workflow-action@0.12.1
-        with:
-          access_token: ${{ github.token }}
-
-      # Build docs
-      - uses: actions/checkout@v4
-      - uses: julia-actions/setup-julia@v2
-        with:
-          version: '1'
-      - name: Instantiate and install dependencies
-        run: |
-          julia --project=docs -e '
-            using Pkg
-            Pkg.develop(PackageSpec(path=pwd()))
-            Pkg.instantiate()'
-      - name: Generate documentation and deploy
-        env: # needed for pushing to gh-pages branch
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} # For authentication with SSH deploy key
-        run:
-          julia --project=docs docs/make.jl
diff --git a/.github/workflows/stale_preview_removal.yml b/.github/workflows/stale_preview_removal.yml
new file mode 100644
index 00000000..67fa01bf
--- /dev/null
+++ b/.github/workflows/stale_preview_removal.yml
@@ -0,0 +1,71 @@
+name: Doc Preview Cleanup
+
+on:
+  schedule:
+    - cron: "0 0 * * *"
+
+jobs:
+  doc-preview-cleanup:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout gh-pages branch
+        uses: actions/checkout@v4
+        with:
+          ref: gh-pages
+      - uses: julia-actions/setup-julia@v2
+      - name: Check for stale PR previews
+        shell: julia {0}
+        run: |
+          using Pkg
+          pkg"activate --temp"
+          pkg"add HTTP JSON3"
+
+          using HTTP
+          using JSON3
+          using Dates
+
+          repo = ENV["GITHUB_REPOSITORY"]
+          retention_days = 14
+
+          pr_previews = map(filter(startswith("PR"), readdir("previews"))) do dir
+              parse(Int, match(r"PR(\d*)", dir)[1])
+          end
+
+          function all_prs()
+              query_prs(page) = JSON3.read(HTTP.get("https://api.github.com/repos/$repo/pulls?per_page=100;page=$(page)").body)
+              prs = []
+              page = 1
+              while true
+                  page_prs = query_prs(page)
+                  isempty(page_prs) && break
+                  append!(prs, page_prs)
+                  page += 1
+              end
+              return prs
+          end
+          prs = all_prs()
+          open_within_threshold = map(x -> x.number, filter(prs) do pr
+              time = DateTime(pr.updated_at[1:19], ISODateTimeFormat)
+              return pr.state == "open" && Dates.days(now() - time) <= retention_days
+          end)
+
+          stale_previews = setdiff(pr_previews, open_within_threshold)
+          @info "Found $(length(stale_previews)) stale previews"
+
+          if isempty(stale_previews)
+              @info "No stale previews"
+              exit(1)
+          end
+
+          for pr in stale_previews
+              path = joinpath("previews", "PR$pr")
+              @info "Removing $path"
+              run(`git rm -rf $path`)
+          end
+      - name: Push changes
+        run: |
+          git config user.name "Documenter.jl"
+          git config user.email "documenter@juliadocs.github.io"
+          git commit -m "delete preview"
+          git branch gh-pages-new $(echo "delete history" | git commit-tree HEAD^{tree})
+          git push --force origin gh-pages-new:gh-pages
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8de7a966..1ebd5eb2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## 0.4.53
+ - Experimental: slicing and in-place updating of array datasets
+ - updated CI workflows
+ - improved pretty printing of attribute header messages
+ - fixed storing of datatype info for HDF5 compatibility
+
 ## 0.4.52
  - fix attribute loading
  - new features: `readmmap` `ismmappable` and `allocate_early` (api experimental)
diff --git a/Project.toml b/Project.toml
index 62709231..981b946d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "JLD2"
 uuid = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
-version = "0.4.52"
+version = "0.4.53"
 
 [deps]
 FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
diff --git a/src/data/number_types.jl b/src/data/number_types.jl
index af176bb3..f4a3f858 100644
--- a/src/data/number_types.jl
+++ b/src/data/number_types.jl
@@ -67,18 +67,18 @@ h5fieldtype(::JLDFile, ::Type{Bool}, ::Type{Bool}, ::Initialized) = BitFieldDatatype(1)
 jltype(::JLDFile, ::BitFieldDatatype) = ReadRepresentation{Bool, Bool}()
 
 h5fieldtype(::JLDFile, ::Type{Float16}, ::Type{Float16}, ::Initialized) =
-    FloatingPointDatatype(DT_FLOATING_POINT, 0x20, 0x0f, 0x00, 2, 0, 16, 10, 5, 0, 10, 0x0000000f)
+    FloatingPointDatatype(UInt8(DT_FLOATING_POINT) + 0x3<<4, 0x20, 0x0f, 0x00, 2, 0, 16, 10, 5, 0, 10, 0x0000000f)
 h5fieldtype(::JLDFile, ::Type{Float32}, ::Type{Float32}, ::Initialized) =
-    FloatingPointDatatype(DT_FLOATING_POINT, 0x20, 0x1f, 0x00, 4, 0, 32, 23, 8, 0, 23, 0x0000007f)
+    FloatingPointDatatype(UInt8(DT_FLOATING_POINT) + 0x3<<4, 0x20, 0x1f, 0x00, 4, 0, 32, 23, 8, 0, 23, 0x0000007f)
 h5fieldtype(::JLDFile, ::Type{Float64}, ::Type{Float64}, ::Initialized) =
-    FloatingPointDatatype(DT_FLOATING_POINT, 0x20, 0x3f, 0x00, 8, 0, 64, 52, 11, 0, 52, 0x000003ff)
+    FloatingPointDatatype(UInt8(DT_FLOATING_POINT) + 0x3<<4, 0x20, 0x3f, 0x00, 8, 0, 64, 52, 11, 0, 52, 0x000003ff)
 
 h5fieldtype(::JLDFile, ::Type{BENumber{Float16}}, ::Type{Float16}, ::Initialized) =
-    FloatingPointDatatype(DT_FLOATING_POINT, 0x21, 0x0f, 0x00, 2, 0, 16, 10, 5, 0, 10, 0x0000000f)
+    FloatingPointDatatype(UInt8(DT_FLOATING_POINT) + 0x3<<4, 0x21, 0x0f, 0x00, 2, 0, 16, 10, 5, 0, 10, 0x0000000f)
 h5fieldtype(::JLDFile, ::Type{BENumber{Float32}}, ::Type{Float32}, ::Initialized) =
-    FloatingPointDatatype(DT_FLOATING_POINT, 0x21, 0x1f, 0x00, 4, 0, 32, 23, 8, 0, 23, 0x0000007f)
+    FloatingPointDatatype(UInt8(DT_FLOATING_POINT) + 0x3<<4, 0x21, 0x1f, 0x00, 4, 0, 32, 23, 8, 0, 23, 0x0000007f)
 h5fieldtype(::JLDFile, ::Type{BENumber{Float64}}, ::Type{Float64}, ::Initialized) =
-    FloatingPointDatatype(DT_FLOATING_POINT, 0x21, 0x3f, 0x00, 8, 0, 64, 52, 11, 0, 52, 0x000003ff)
+    FloatingPointDatatype(UInt8(DT_FLOATING_POINT) + 0x3<<4, 0x21, 0x3f, 0x00, 8, 0, 64, 52, 11, 0, 52, 0x000003ff)
 
 function jltype(f::JLDFile, dt::FloatingPointDatatype)
     if dt == h5fieldtype(f, Float64, Float64, Val{true})
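The constant in the first argument packs two fields of the HDF5 datatype message into a single byte: the low nibble is the datatype class and the high nibble is the message version. The rewritten constructors emit version 3 instead of 0, which is what the CHANGELOG entry about datatype info for HDF5 compatibility refers to. A minimal sketch of the packing, assuming the class constants follow the HDF5 spec (floating point = class 1):

```julia
# Sketch: byte 0 of an HDF5 datatype message is `class | version << 4`.
const DT_FLOATING_POINT = 0x01   # datatype class 1: floating point (per HDF5 spec)
version = 0x3                    # datatype message version 3

class_and_version = UInt8(DT_FLOATING_POINT) | version << 4   # == 0x31
# `+` and `|` are interchangeable here because the two nibbles never overlap:
@assert UInt8(DT_FLOATING_POINT) + version << 4 == class_and_version
```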
diff --git a/src/data/reconstructing_datatypes.jl b/src/data/reconstructing_datatypes.jl
index ca40bc7c..f55036a7 100644
--- a/src/data/reconstructing_datatypes.jl
+++ b/src/data/reconstructing_datatypes.jl
@@ -62,8 +62,10 @@ end
 
 # jltype is the inverse of h5type, providing a ReadRepresentation for an
 # H5Datatype. We handle committed datatypes here, and other datatypes below.
-function jltype(f::JLDFile, cdt::CommittedDatatype)
+function jltype(f::JLDFile, sdt::Union{SharedDatatype,CommittedDatatype})
+    cdt = get(f.datatype_locations, sdt.header_offset, sdt)
     haskey(f.h5jltype, cdt) && return f.h5jltype[cdt]::ReadRepresentation
+
     dt, attrs = read_shared_datatype(f, cdt)
 
     julia_type_attr = nothing
@@ -75,51 +77,40 @@ function jltype(f::JLDFile, cdt::CommittedDatatype)
             written_type_attr = attr
         end
     end
+    isnothing(julia_type_attr) && return f.h5jltype[cdt] = jltype(f, dt)
 
-    if isa(julia_type_attr, Nothing)
-        throw(InvalidDataException())
-    end
-    julia_type_attr = julia_type_attr::ReadAttribute
-
-    # If type of datatype is this datatype, then this is the committed
-    # datatype that describes a datatype
-    if julia_type_attr.datatype isa SharedDatatype &&
-       julia_type_attr.datatype.header_offset == cdt.header_offset
-        # Verify that the datatype matches our expectations
+    # Bootstrap: the datatype of datatype is a datatype
+    if julia_type_attr.datatype == SharedDatatype(cdt.header_offset)
         if dt != H5TYPE_DATATYPE
-            error("""The HDF5 datatype representing a Julia datatype does not match
-                     the expectations of this version of JLD.
-
-                     You may need to update JLD to read this file.""")
+            throw(InternalError("""The HDF5 datatype representing a Julia datatype does not match
+                                   the expectations of this version of JLD2.
+                                   You may need to update JLD2 to read this file."""))
         end
         f.jlh5type[DataType] = cdt
         f.datatypes[cdt.index] = dt
         return (f.h5jltype[cdt] = ReadRepresentation{DataType, DataTypeODR()}())
     end
 
+    f.plain && return f.h5jltype[cdt] = jltype(f, dt)
+
     datatype = read_attr_data(f, julia_type_attr)
-    if f.plain && !(datatype isa Upgrade) && !(datatype <: Tuple)
-        rr = jltype(f, dt)
-        return f.h5jltype[cdt] = rr
-    end
-
-    if written_type_attr !== nothing
+    if !isnothing(written_type_attr)
         # Custom serialization
         custom_datatype = read_attr_data(f, written_type_attr)
         read_as = _readas(custom_datatype, datatype)
         if read_as <: UnknownType
             @warn("custom serialization of $(typestring(read_as))" *
                   " encountered, but the type does not exist in the workspace; the data will be read unconverted")
-            rr = (constructrr(f, custom_datatype, dt, attrs)::Tuple{ReadRepresentation,Bool})[1]
+            rr, _ = constructrr(f, custom_datatype, dt, attrs)
             canonical = false
         else
-            rr, canonical = constructrr(f, custom_datatype, dt, attrs)::Tuple{ReadRepresentation,Bool}
-            rrty = typeof(rr)
-            rr = ReadRepresentation{read_as, CustomSerialization{rrty.parameters[1], rrty.parameters[2]}}()
-            canonical = canonical && writeas(read_as) === custom_datatype
+            rr, canonical = constructrr(f, custom_datatype, dt, attrs)
+            rr = ReadRepresentation{read_as, CustomSerialization{typeof(rr).parameters...}}()
+            canonical &= writeas(read_as) === custom_datatype
         end
     else
-        rr, canonical = constructrr(f, datatype, dt, attrs)::Tuple{ReadRepresentation,Bool}
+        rr, canonical = constructrr(f, datatype, dt, attrs)
     end
 
     canonical && (f.jlh5type[datatype] = cdt)
@@ -128,16 +119,6 @@ end
 end
 
 
-# jltype is the inverse of h5type, providing a ReadRepresentation for an
-# H5Datatype. We handle shared datatypes here: ones that were not "committed" by JLD2.
-function jltype(f::JLDFile, sdt::SharedDatatype)
-    haskey(f.h5jltype, sdt) && return f.h5jltype[sdt]::ReadRepresentation
-    dt, attrs = read_shared_datatype(f, sdt)
-    rr = jltype(f, dt)
-    f.h5jltype[sdt] = rr
-end
-
-
 # Constructs a ReadRepresentation for a given opaque (bitstype) type
 function constructrr(::JLDFile, T::DataType, dt::BasicDatatype,
                      attrs::Vector{ReadAttribute})
@@ -381,7 +362,7 @@ function types_from_refs(f::JLDFile, ptr::Ptr)
             # If the reference is to a committed datatype, read the datatype
             nulldt = CommittedDatatype(UNDEFINED_ADDRESS, 0)
             cdt = get(f.datatype_locations, ref, nulldt)
-            res = cdt !== nulldt ? (typeof(jltype(f, cdt)::ReadRepresentation)::DataType).parameters[1] : load_dataset(f, ref)
+            res = cdt !== nulldt ? eltype(jltype(f, cdt)) : load_dataset(f, ref)
             unknown_params = unknown_params || isunknowntype(res) || isreconstructed(res)
             res
         end for ref in refs]
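With the two methods merged, a `SharedDatatype` read from a dataset header is first upgraded to the corresponding `CommittedDatatype` if JLD2 recorded one at the same offset; otherwise it is decoded from the plain HDF5 type information. A rough sketch of the lookup, assuming a `JLDFile` `f` and a hypothetical offset:

```julia
# Sketch (hypothetical offset): shared -> committed upgrade, then cached decode.
sdt = SharedDatatype(RelOffset(0x1234))
cdt = get(f.datatype_locations, sdt.header_offset, sdt)  # CommittedDatatype if known
rr  = jltype(f, cdt)   # ReadRepresentation, memoized in f.h5jltype
T   = eltype(rr)       # the Julia type the data will be read as
```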
diff --git a/src/datasets.jl b/src/datasets.jl
index 69e9f379..92d50072 100644
--- a/src/datasets.jl
+++ b/src/datasets.jl
@@ -66,26 +66,13 @@ Otherwise, `datatype_offset` points to the offset of the datatype attribute.
         filters::FilterPipeline=FilterPipeline(), header_offset::RelOffset=NULL_REFERENCE,
         attributes::Union{Vector{ReadAttribute},Nothing}=nothing)
-    # See if there is a julia type attribute
-    io = f.io
-    if dt isa SharedDatatype
-        # this means that it is "committed" to `_types` if the file was written by JLD2
-        rr = jltype(f, get(f.datatype_locations, dt.header_offset, dt))
-
-        if layout.data_offset == -1
-            # There was no layout message.
-            # That means, this dataset is just a datatype
-            # return the Datatype
-            return typeof(rr).parameters[1]
-        end
-
-        seek(io, layout.data_offset)
-        read_dataspace = (dataspace, header_offset, layout, filters)
-        read_data(f, rr, read_dataspace, attributes)
-
+    rr = jltype(f, dt)
+    if layout.data_offset == -1
+        # There was no layout message;
+        # this dataset is just a datatype, so return the Datatype.
+        return typeof(rr).parameters[1]
     elseif layout.data_offset == typemax(Int64)
-        rr = jltype(f, dt)
-        T,S = typeof(rr).parameters
+        T,_ = typeof(rr).parameters
         if layout.data_length > -1
             # TODO: this could use the fill value message to populate the array
             @warn "This array should be populated by a fill value. This is not (yet) implemented."
@@ -93,21 +80,10 @@ Otherwise, `datatype_offset` points to the offset of the datatype attribute.
         v = Array{T, 1}()
         track_weakref!(f, header_offset, v)
         return v
-    else
-        dtt = dt
-        rr = jltype(f, dtt)
-
-        if layout.data_offset == -1
-            # There was no layout message.
-            # That means, this dataset is just a datatype
-            # return the Datatype
-            return typeof(rr).parameters[1]
-        end
-
-        seek(io, layout.data_offset)
-        read_dataspace = (dataspace, header_offset, layout, filters)
-        read_data(f, rr, read_dataspace, attributes)
     end
+    seek(f.io, layout.data_offset)
+    read_dataspace = (dataspace, header_offset, layout, filters)
+    read_data(f, rr, read_dataspace, attributes)
 end
 
 # Most types can only be scalars or arrays
@@ -213,7 +189,7 @@ function read_empty(rr::ReadRepresentation{T}, f::JLDFile,
     dimensions_attr.datatype == h5fieldtype(f, Int64, Int64, Val{true}) ||
         throw(UnsupportedFeatureException())
     seek(io, dimensions_attr.data_offset)
-    v = construct_array(io, T, Val(ndims))
+    v = construct_array(io, T, ndims)
     if isconcretetype(T)
        for i = 1:length(v)
            @inbounds v[i] = jlconvert(rr, f, Ptr{Cvoid}(0), header_offset)
@@ -244,30 +220,12 @@ function get_ndims_offset(f::JLDFile, dataspace::ReadDataspace, attributes::AbstractVector)
 end
 
 """
-    construct_array{T}(io::IO, ::Type{T}, ::Val{ndims})
+    construct_array(io::IO, eltype, ndims::Int)
 
 Construct array by reading `ndims` dimensions from `io`. Assumes `io` has already been
 seeked to the correct position.
 """
-function construct_array(io::IO, ::Type{T}, ::Val{1}) where {T}
-    n = jlread(io, Int64)
-    Vector{T}(undef, n)
-end
-
-function construct_array(io::IO, ::Type{T}, ::Val{2}) where {T}
-    d2 = jlread(io, Int64)
-    d1 = jlread(io, Int64)
-    Matrix{T}(undef, d1, d2)
-end
-
-function construct_array(io::IO, ::Type{T}, ::Val{3}) where {T}
-    d3 = jlread(io, Int64)
-    d2 = jlread(io, Int64)
-    d1 = jlread(io, Int64)
-    Array{T,3}(undef, d1, d2, d3)
-end
-
-function construct_array(io::IO, ::Type{T}, ::Val{N})::Array{T,N} where {T,N}
+function construct_array(io::IO, ::Type{T}, N::Int) where {T}
     ds = reverse(ntuple(i->jlread(io, Int64), Val(N)))
     Array{T,N}(undef, ds...)
 end
@@ -283,7 +241,7 @@ end
 
     ndims, offset = get_ndims_offset(f, dataspace, attributes)
     seek(io, offset)
-    v = construct_array(io, T, Val(Int(ndims)))
+    v = construct_array(io, T, Int(ndims))
     n = length(v)
     seek(io, data_offset)
     if iscompressed(filters)
@@ -296,7 +254,7 @@ end
     else
         ndims, offset = get_ndims_offset(f, dataspace, attributes)
         seek(io, offset)
-        v = construct_array(io, T, Val(Int(ndims)))
+        v = construct_array(io, T, Int(ndims))
         if layout.version == 3
             # version 1 B-tree
             # This version appears to be padding incomplete chunks
@@ -385,14 +343,17 @@ end
     seek(io, header_offset)
     f.end_of_data = header_offset + fullsz
 
-    if ismutabletype(typeof(data)) && !isa(wsession, JLDWriteSession{Union{}})
-        wsession.h5offset[objectid(data)] = h5offset(f, header_offset)
-        push!(wsession.objects, data)
-    end
+    track!(wsession, data, h5offset(f, header_offset))
 
     cio = begin_checksum_write(io, fullsz - 4)
-    write_object_header_and_dataspace_message(cio, f, psz, dataspace)
-    write_datatype_message(cio, datatype)
+    jlwrite(cio, ObjectStart(size_flag(psz)))
+    write_size(cio, psz)
+    write_header_message(cio, Val(HmFillValue); flags=0x09)
+    write_header_message(cio, Val(HmDataspace); dataspace.dataspace_type, dimensions=dataspace.size)
+    for attr in dataspace.attributes
+        write_header_message(cio, f, attr)
+    end
+    write_header_message(cio, Val(HmDatatype), 1 | (2*isa(datatype, CommittedDatatype)); dt=datatype)
 
     # Data storage layout
     if layout_class == LcCompact
@@ -429,25 +390,10 @@ end
     h5offset(f, header_offset)
 end
 
-function write_object_header_and_dataspace_message(cio::IO, f::JLDFile, psz::Int, dataspace::WriteDataspace)
-    jlwrite(cio, ObjectStart(size_flag(psz)))
-    write_size(cio, psz)
-    write_header_message(cio, Val(HmFillValue); flags=0x09)
-    write_header_message(cio, Val(HmDataspace); dataspace.dataspace_type, dimensions=dataspace.size)
-    for attr in dataspace.attributes
-        write_header_message(cio, f, attr)
-    end
-end
-
-write_datatype_message(cio::IO, dt::H5Datatype) =
-    write_header_message(cio, Val(HmDatatype), 1 | (2*isa(dt, CommittedDatatype)); dt)
-
 @nospecializeinfer function write_dataset(f::JLDFile, @nospecialize(x), wsession::JLDWriteSession)::RelOffset
-    if ismutabletype(typeof(x)) && !isa(wsession, JLDWriteSession{Union{}})
-        offset = get(wsession.h5offset, objectid(x), UNDEFINED_ADDRESS)
-        offset != UNDEFINED_ADDRESS && return offset
-    end
+    offset = get_tracked(wsession, x)
+    offset != UNDEFINED_ADDRESS && return offset
     odr = objodr(x)
     write_dataset(f, WriteDataspace(f, x, odr), h5type(f, x), odr, x, wsession)::RelOffset
 end
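`track!` and `get_tracked` (defined in src/types.jl below) replace the open-coded bookkeeping here: a mutable object is registered under its `objectid` on first write, and a later write of the same object returns the stored offset instead of writing a second copy. The observable effect, as a sketch:

```julia
# Sketch: within one save, an aliased mutable object is stored only once.
using JLD2

a = [1.0, 2.0, 3.0]
jldsave("dedup.jld2"; x = a, y = a)   # y reuses the dataset already written for x
jldopen("dedup.jld2") do f
    f["x"] == f["y"]                  # same contents, stored once in the file
end
```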
diff --git a/src/datatypes.jl b/src/datatypes.jl
index 202e65d2..90f77d93 100644
--- a/src/datatypes.jl
+++ b/src/datatypes.jl
@@ -34,11 +34,11 @@ struct BasicDatatype <: H5Datatype
 end
 define_packed(BasicDatatype)
 StringDatatype(::Type{String}, size::Integer) =
-    BasicDatatype(DT_STRING, 0x11, 0x00, 0x00, size)
+    BasicDatatype(UInt8(DT_STRING) | 0x3<<4, 0x11, 0x00, 0x00, size)
 OpaqueDatatype(size::Integer) =
-    BasicDatatype(DT_OPAQUE, 0x00, 0x00, 0x00, size) # XXX make sure ignoring the tag is OK
+    BasicDatatype(UInt8(DT_OPAQUE) | 0x3<<4, 0x00, 0x00, 0x00, size) # XXX make sure ignoring the tag is OK
 ReferenceDatatype() =
-    BasicDatatype(DT_REFERENCE, 0x00, 0x00, 0x00, jlsizeof(RelOffset))
+    BasicDatatype(UInt8(DT_REFERENCE) | 0x3<<4, 0x00, 0x00, 0x00, jlsizeof(RelOffset))
 
 function Base.:(==)(dt1::BasicDatatype, dt2::BasicDatatype)
     ret = true
@@ -124,7 +124,7 @@ struct BitFieldDatatype <: H5Datatype
 end
 define_packed(BitFieldDatatype)
 BitFieldDatatype(size) =
-    BitFieldDatatype(DT_BITFIELD, 0x00, 0x00, 0x00, size, 0, 8*size)
+    BitFieldDatatype(UInt8(DT_BITFIELD) | 0x3<<4, 0x00, 0x00, 0x00, size, 0, 8*size)
 
 
 struct FloatingPointDatatype <: H5Datatype
@@ -198,7 +198,7 @@ end
 
 function jlwrite(io::IO, dt::CompoundDatatype)
     n = length(dt.names)
-    jlwrite(io, BasicDatatype(DT_COMPOUND, n % UInt8, (n >> 8) % UInt8, 0x00, dt.size))
+    jlwrite(io, BasicDatatype(UInt8(DT_COMPOUND) | 0x3<<4, n % UInt8, (n >> 8) % UInt8, 0x00, dt.size))
     for i = 1:length(dt.names)
         # Name
         name = dt.names[i]
@@ -273,7 +273,7 @@ struct VariableLengthDatatype{T<:H5Datatype} <: H5Datatype
     basetype::T
 end
 VariableLengthDatatype(basetype::H5Datatype) =
-    VariableLengthDatatype{typeof(basetype)}(DT_VARIABLE_LENGTH, 0x00, 0x00, 0x00, 8+jlsizeof(RelOffset), basetype)
+    VariableLengthDatatype{typeof(basetype)}(UInt8(DT_VARIABLE_LENGTH) | 0x3<<4, 0x00, 0x00, 0x00, 8+jlsizeof(RelOffset), basetype)
 VariableLengthDatatype(class, bitfield1, bitfield2, bitfield3, size, basetype::H5Datatype) =
     VariableLengthDatatype{typeof(basetype)}(class, bitfield1, bitfield2, bitfield3, size, basetype)
@@ -288,7 +288,7 @@ jlsizeof(dt::VariableLengthDatatype) =
     jlsizeof(BasicDatatype) + jlsizeof(dt.basetype)
 
 function jlwrite(io::IO, dt::VariableLengthDatatype)
-    jlwrite(io, BasicDatatype(DT_VARIABLE_LENGTH, dt.bitfield1, dt.bitfield2, dt.bitfield3, dt.size))
+    jlwrite(io, BasicDatatype(UInt8(DT_VARIABLE_LENGTH) | 0x3<<4, dt.bitfield1, dt.bitfield2, dt.bitfield3, dt.size))
     jlwrite(io, dt.basetype)
 end
diff --git a/src/explicit_datasets.jl b/src/explicit_datasets.jl
index f4d782ab..1616a516 100644
--- a/src/explicit_datasets.jl
+++ b/src/explicit_datasets.jl
@@ -118,12 +118,11 @@ end
 
 Write data to file using metadata prepared in the `dataset`.
 """
-function write_dataset(dataset::Dataset, data)
+function write_dataset(dataset::Dataset, data, wsession::JLDWriteSession=JLDWriteSession())
     f = dataset.parent.f
     if dataset.offset != UNDEFINED_ADDRESS
         throw(ArgumentError("Dataset has already been written to file"))
     end
-    wsession = JLDWriteSession()
     # first need to figure out if data type and dataspace are defined / correct
     if isnothing(dataset.datatype)
         dataset.datatype = h5type(f, data)
@@ -134,103 +133,29 @@ function write_dataset(dataset::Dataset, data)
         dataset.dataspace = WriteDataspace(f, data, odr)
     end
     dataspace = dataset.dataspace
-    # Attributes
-    attributes = map(collect(dataset.attributes)) do (name, attr)
-        attr isa WrittenAttribute && return attr
-        return WrittenAttribute(dataset.parent.f, name, attr)
-        throw(ArgumentError("Invalid attribute: $a"))
-    end
-    io = f.io
-    odr = objodr(data)
-    datasz = odr_sizeof(odr)::Int * numel(dataspace)::Int
-
-    psz = payload_size_without_storage_message(dataspace, datatype)::Int
-
-    psz += sum(message_size.(attributes), init=0)
-
-    # minimum extra space for continuation message
-    psz += jlsizeof(HeaderMessage) + jlsizeof(RelOffset) + jlsizeof(Length)
-
-
-    # determine layout class
-    # DataLayout object is only available after the data is written
-    if datasz == 0 || (!(data isa Array) && datasz < 8192)
-        layout_class = LcCompact
-        psz += jlsizeof(Val(HmDataLayout); layout_class, data_size=datasz)
-    elseif !isnothing(dataset.chunk) || !isempty(dataset.filters.filters)
+    if !isempty(dataset.filters.filters)
         filter_id = dataset.filters.filters[1].id
         invoke_again, compressor = get_compressor(filter_id)
         if invoke_again
             return Base.invokelatest(write_dataset, dset, data)::RelOffset
         end
-        # Do some additional checks on the data here
-        layout_class = LcChunked
-        # improve filter support here
-        psz += chunked_storage_message_size(ndims(data)) + pipeline_message_size(filter_id::UInt16)
     else
-        layout_class = LcContiguous
-        psz += jlsizeof(Val(HmDataLayout); layout_class)
-    end
-    fullsz = jlsizeof(ObjectStart) + size_size(psz) + psz + 4
-
-    header_offset = f.end_of_data
-    seek(io, header_offset)
-    f.end_of_data = header_offset + fullsz
-
-    if ismutabletype(typeof(data)) && !isa(wsession, JLDWriteSession{Union{}})
-        wsession.h5offset[objectid(data)] = h5offset(f, header_offset)
-        push!(wsession.objects, data)
+        compressor = nothing
     end
-
-    cio = begin_checksum_write(io, fullsz - 4)
-    write_object_header_and_dataspace_message(cio, f, psz, dataspace)
-    write_datatype_message(cio, datatype)
-    for a in attributes
-        write_header_message(cio, f, a, wsession)
-    end
-    # Data storage layout
-    if layout_class == LcCompact
-        write_header_message(cio, Val(HmDataLayout); layout_class, data_size=datasz)
-        if datasz != 0
-            write_data(cio, f, data, odr, datamode(odr), wsession)
-        end
-        dataset.header_chunk_info = (header_offset, position(cio)+20, position(cio))
-        # Add NIL message replacable by continuation message
-        write_continuation_placeholder(cio)
-        jlwrite(io, end_checksum(cio))
-    elseif layout_class == LcChunked
-        write_filter_pipeline_message(cio, filter_id)
-
-        # deflate first
-        deflated = deflate_data(f, data, odr, wsession, compressor)
-
-        write_chunked_storage_message(cio, odr_sizeof(odr), size(data), length(deflated), h5offset(f, f.end_of_data))
-        dataset.header_chunk_info = (header_offset, position(cio)+20, position(cio))
-        write_continuation_placeholder(cio)
-        jlwrite(f.io, end_checksum(cio))
-
-        seek(f.io, f.end_of_data)
-        f.end_of_data += length(deflated)
-        jlwrite(f.io, deflated)
-    else
-        # Align contiguous chunk to 8 bytes in the file
-        address = f.end_of_data + 8 - mod1(f.end_of_data, 8)
-        data_address = h5offset(f, address)
-        write_header_message(cio, Val(HmDataLayout);
-            layout_class, data_address, data_size=datasz)
-
-        dataset.header_chunk_info = (header_offset, position(cio)+20, position(cio))
-        # Add NIL message replacable by continuation message
-        write_continuation_placeholder(cio)
-        jlwrite(io, end_checksum(cio))
-
-        f.end_of_data = address + datasz
-        seek(io, address)
-        write_data(io, f, data, odr, datamode(odr), wsession)
+    offset = write_dataset(f, dataspace, datatype, odr, data, wsession, compressor)
+    !isempty(dataset.name) && (dataset.parent[dataset.name] = offset)
+    # Attributes
+    attrs = map(collect(keys(pairs(dataset.attributes)))) do name
+        WrittenAttribute(f, name, dataset.attributes[name])
     end
+    dataset = get_dataset(f, offset, dataset.parent, dataset.name)
+    dataset.header_chunk_info =
+        attach_message(f, dataset.offset, attrs, wsession;
+            chunk_start=dataset.header_chunk_info[1],
+            chunk_end=dataset.header_chunk_info[2],
+            next_msg_offset=dataset.header_chunk_info[3],
+        )
 
-    offset = h5offset(f, header_offset)
-    !isempty(dataset.name) && (dataset.parent[dataset.name] = offset)
     return offset
 end
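The rewritten method delegates header and data writing to the shared `write_dataset(f, dataspace, datatype, odr, data, wsession, compressor)` path and attaches attributes afterwards via `attach_message`, instead of duplicating the header-layout logic inline. A hedged usage sketch of this explicit-dataset path (assuming `JLD2.create_dataset` from the existing dataset API, and that attributes added before writing are flushed by `write_dataset`):

```julia
using JLD2

jldopen("attrs.jld2", "w") do f
    dset = JLD2.create_dataset(f, "x")           # prepare metadata first
    JLD2.add_attribute(dset, "unit", "meters")   # stored in dset.attributes
    JLD2.write_dataset(dset, [1.0, 2.0, 3.0])    # writes data, then attaches attributes
end
```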
@@ -247,7 +172,7 @@ function read_dataset(dset::Dataset)
         DataLayout(f, dset.layout),
         isnothing(dset.filters) ? FilterPipeline() : dset.filters,
         dset.offset,
-        collect(values(dset.attributes)))
+        collect(ReadAttribute, values(dset.attributes)))
 end
 
 """
@@ -301,11 +226,6 @@ function get_dataset(f::JLDFile, offset::RelOffset, g=f.root_group, name="")
     return dset
 end
 
-function add_attribute(dset::Dataset, name::String, data::Dataset)
-    # link an existing dataset as attribute
-    throw(UnsupportedFeatureException("Not implemented"))
-end
-
 # Attributes
 message_size(msg::WrittenAttribute) = jlsizeof(HeaderMessage) + jlsizeof(msg)
 function write_header_message(io, f::JLDFile, msg::WrittenAttribute, wsession=JLDWriteSession())
@@ -426,9 +346,9 @@ function add_attribute(dset::Dataset, name::String, data, wsession=JLDWriteSession())
     f = dset.parent.f
     prewrite(f) # assert writability
-    for attr in dset.attributes
-        if (attr isa ReadAttribute && attr.name == name) || (attr isa Pair && attr.first == name)
-            throw(ArgumentError("Attribute $name already exists. Attribute names must be unique."))
+    for attrname in keys(dset.attributes)
+        if name == attrname
+            throw(ArgumentError("Attribute \"$name\" already exists. Attribute names must be unique."))
         end
     end
     dset.attributes[name] = data
@@ -472,11 +392,7 @@ function ismmappable(dset::Dataset)
     iswritten(dset) || return false
     f = dset.parent.f
     dt = dset.datatype
-    if dt isa SharedDatatype
-        rr = jltype(f, get(f.datatype_locations, dt.header_offset, dt))
-    else
-        rr = jltype(f, dt)
-    end
+    rr = jltype(f, dt)
     T = typeof(rr).parameters[1]
     !(samelayout(T)) && return false
     !isempty(dset.filters.filters) && return false
@@ -504,11 +420,7 @@ function readmmap(dset::Dataset)
 
     # figure out the element type
     dt = dset.datatype
-    if dt isa SharedDatatype
-        rr = jltype(f, get(f.datatype_locations, dt.header_offset, dt))
-    else
-        rr = jltype(f, dt)
-    end
+    rr = jltype(f, dt)
     T = typeof(rr).parameters[1]
     ndims, offset = get_ndims_offset(f, ReadDataspace(f, dset.dataspace), collect(values(dset.attributes)))
 
@@ -562,8 +474,14 @@ function allocate_early(dset::Dataset, T::DataType)
     f.end_of_data = header_offset + fullsz
 
     cio = begin_checksum_write(io, fullsz - 4)
-    write_object_header_and_dataspace_message(cio, f, psz, dataspace)
-    write_datatype_message(cio, datatype)
+    jlwrite(cio, ObjectStart(size_flag(psz)))
+    write_size(cio, psz)
+    write_header_message(cio, Val(HmFillValue); flags=0x09)
+    write_header_message(cio, Val(HmDataspace); dataspace.dataspace_type, dimensions=dataspace.size)
+    for attr in dataspace.attributes
+        write_header_message(cio, f, attr)
+    end
+    write_header_message(cio, Val(HmDatatype), 1 | (2*isa(datatype, CommittedDatatype)); dt=datatype)
     for a in attributes
         write_header_message(cio, f, a, wsession)
     end
@@ -592,4 +510,53 @@ function allocate_early(dset::Dataset, T::DataType)
     end
     return offset
 end
-end
\ No newline at end of file
+end
+
+struct ArrayDataset{T, N, ODR, io} <: AbstractArray{T, N}
+    f::JLDFile{io}
+    dset::Dataset
+    dims::NTuple{N, Int}
+    data_address::Int64
+    rr::ReadRepresentation{T, ODR}
+end
+function ArrayDataset(dset::Dataset)
+    isarraydataset(dset) || throw(ArgumentError("Dataset is not an array"))
+    iscompressed(dset.filters) && throw(UnsupportedFeatureException("Compressed datasets are not supported."))
+    f = dset.parent.f
+    dt = dset.datatype
+    return ArrayDataset(
+        f, dset,
+        Int.(reverse(dset.dataspace.dimensions)),
+        fileoffset(f, dset.layout.data_address),
+        jltype(f, !(f.plain) && dt isa SharedDatatype ?
+            get(f.datatype_locations, dt.header_offset, dt) : dt)
+    )
+end
+
+function isarraydataset(dset::Dataset)
+    isnothing(dset.dataspace) && return false
+    ds = dset.dataspace
+    if ds isa HmWrap{HmDataspace}
+        return ds.dataspace_type == DS_SIMPLE || ds.dataspace_type == DS_V1
+    end
+    return false
+end
+
+Base.IndexStyle(::Type{<:ArrayDataset}) = IndexLinear()
+Base.size(A::ArrayDataset) = A.dims
+Base.getindex(dset::Dataset, I...) = ArrayDataset(dset)[I...]
+Base.getindex(dset::Dataset) = read_dataset(dset)
+Base.setindex!(dset::Dataset, v, i, I...) = Base.setindex!(ArrayDataset(dset), v, i, I...)
+
+function Base.getindex(A::ArrayDataset, i::Integer)
+    @boundscheck checkbounds(A, i)
+    seek(A.f.io, A.data_address + (i-1)*odr_sizeof(A.rr))
+    return read_scalar(A.f, A.rr, UNDEFINED_ADDRESS)
+end
+
+function Base.setindex!(A::ArrayDataset{T,N,ODR}, v, i::Integer) where {T,N,ODR}
+    @boundscheck checkbounds(A, i)
+    A.f.writable || throw(ArgumentError("Cannot edit in read-only mode"))
+    seek(A.f.io, A.data_address + (i-1)*odr_sizeof(A.rr))
+    write_data(A.f.io, A.f, v, T, datamode(ODR), JLDWriteSession())
+    return v
+end
diff --git a/src/inlineunion.jl b/src/inlineunion.jl
index a9bd7b37..e84debdb 100644
--- a/src/inlineunion.jl
+++ b/src/inlineunion.jl
@@ -35,10 +35,8 @@ end
 @nospecializeinfer function write_dataset(f::JLDFile, @nospecialize(x::Array), wsession::JLDWriteSession, @nospecialize(compress=f.compress))
     T = eltype(x)
-    if !isa(wsession, JLDWriteSession{Union{}})
-        offset = get(wsession.h5offset, objectid(x), UNDEFINED_ADDRESS)
-        offset != UNDEFINED_ADDRESS && return offset
-    end
+    offset = get_tracked(wsession, x)
+    offset != UNDEFINED_ADDRESS && return offset
     if T isa Union && writeasbits(T)
         # Conversion has to be done earlier here because
         # vectors are special cased in dispatch
@@ -60,7 +58,7 @@ function read_array(f::JLDFile, dataspace::ReadDataspace,
     io = f.io
     ndims, offset = get_ndims_offset(f, dataspace, attributes)
     seek(io, offset)
-    v = construct_array(io, InlineUnionEl{T1,T2}, Val(Int(ndims)))
+    v = construct_array(io, InlineUnionEl{T1,T2}, Int(ndims))
     n = length(v)
     seek(io, layout.data_offset)
     if iscompressed(filters)
diff --git a/src/macros_utils.jl b/src/macros_utils.jl
index 526a4e2f..80ee8c2d 100644
--- a/src/macros_utils.jl
+++ b/src/macros_utils.jl
@@ -114,7 +114,7 @@ function linefun(ex)
         increment = esc(n)
         off_inc = :($offset += $increment)
         write_inc = :(write_zerobytes(io, $increment))
-        return [write_inc, off_inc, off_inc]
+        return [write_inc, off_inc, :(skip($io, $increment))]
     elseif @capture(ex, s_Symbol::T_) || @capture(ex, s_Symbol::T_ = v_)
         getprop_ = :($(esc(s)) = $kw.$(s))
         default = Symbol(s,"_default")
diff --git a/src/types.jl b/src/types.jl
index 28e5fb3a..c9e7ad77 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -114,14 +114,24 @@ referenced multiple times are written multiple times.
 """
 struct JLDWriteSession{T<:Union{Dict{UInt,RelOffset},Union{}}}
     h5offset::T
-    objects::Vector{Any}
-
-    JLDWriteSession{T}() where T = new()
-    JLDWriteSession{T}(h5offset, objects) where T = new(h5offset, objects)
+    JLDWriteSession{T}(h5offset, objects) where T = new(h5offset)
 end
 JLDWriteSession() = JLDWriteSession{Dict{UInt,RelOffset}}(Dict{UInt,RelOffset}(), Any[])
-
-
+track!(::JLDWriteSession{Union{}}, args...) = nothing
+function track!(s::JLDWriteSession, data, offset::RelOffset)
+    if ismutabletype(typeof(data))
+        s.h5offset[objectid(data)] = offset
+    end
+    nothing
+end
+get_tracked(wsession::JLDWriteSession{Union{}}, data) = UNDEFINED_ADDRESS
+function get_tracked(wsession::JLDWriteSession, data)
+    if ismutabletype(typeof(data))
+        return get(wsession.h5offset, objectid(data), UNDEFINED_ADDRESS)
+    end
+    return UNDEFINED_ADDRESS
+end
 
 """
     GlobalHeap
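Both session flavors keep the write path simple: the `Union{}` session turns tracking into a no-op, while the default session tracks only mutable values, keyed by `objectid`. For illustration (the offsets are hypothetical; these are internal, unexported names):

```julia
s = JLDWriteSession()                  # default tracking session
v = [1, 2, 3]
track!(s, v, RelOffset(0x100))         # remembered: Vector is mutable
get_tracked(s, v)                      # -> RelOffset(0x100)

track!(s, (1, 2), RelOffset(0x200))    # ignored: Tuple is immutable
get_tracked(s, (1, 2))                 # -> UNDEFINED_ADDRESS
```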
diff --git a/test/dataset_api.jl b/test/dataset_api.jl
index b82a9b65..929899cf 100644
--- a/test/dataset_api.jl
+++ b/test/dataset_api.jl
@@ -31,4 +31,36 @@ using JLD2, Test
         end
         @test load(fn)["d"] == zeros(1000,1000)
     end
+end
+
+@testset "Slicing & Updating" begin
+    cd(mktempdir()) do
+        fn = "test.jld2"
+        jldsave(fn; a=42, b = [42 43 44; 45 46 47], c = [(0x00, 1f0), (0x42, 2f0)])
+        jldopen(fn) do f
+            dset = JLD2.get_dataset(f, "a")
+            @test dset[] == 42
+
+            dset = JLD2.get_dataset(f, "b")
+            @test dset[] == [42 43 44; 45 46 47]
+            @test dset[1] == 42
+            @test dset[1,1] == 42
+            @test dset[1:2, 1:2] == [42 43; 45 46]
+            @test dset[1,1:2:3] == [42, 44]
+            @test_throws BoundsError dset[7]
+            @test_throws BoundsError dset[2,4]
+            @test_throws ArgumentError dset[1] = 1
+        end
+        jldopen(fn, "a") do f
+            dset = JLD2.get_dataset(f, "b")
+            dset[2] = -1
+            @test dset[] == [42 43 44; -1 46 47]
+            dset[1,1:2:3] = [1,5]
+            @test dset[] == [1 43 5; -1 46 47]
+
+            dset = JLD2.get_dataset(f, "c")
+            dset[2] = (0xff, 0f0)
+            @test f["c"] == [(0x00, 1f0), (0xff, 0f0)]
+        end
+    end
 end
\ No newline at end of file
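These tests exercise the new indexing path end to end. Because `getindex(::ArrayDataset, i)` seeks to `data_address + (i-1)*odr_sizeof(rr)` and reads a single element, slicing a dataset touches only the requested bytes rather than materializing the whole array. A sketch of the intended use on a large file:

```julia
using JLD2

jldsave("big.jld2"; A = rand(10_000, 10_000))
jldopen("big.jld2") do f
    dset = JLD2.get_dataset(f, "A")
    dset[1:3, 1]    # reads three Float64s from disk, not the full ~800 MB array
end
```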