From bcc191fbf87cd0ef0538761fa29079ab44e55ca9 Mon Sep 17 00:00:00 2001 From: l-k- Date: Tue, 28 Nov 2023 23:30:18 -0500 Subject: [PATCH] handle failed checksum in various parts of file --- R/readAxivity.R | 182 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 150 insertions(+), 32 deletions(-) diff --git a/R/readAxivity.R b/R/readAxivity.R index ec52ec7..682ef70 100755 --- a/R/readAxivity.R +++ b/R/readAxivity.R @@ -4,6 +4,8 @@ readAxivity = function(filename, start = 0, end = 0, progressBar = FALSE, desire if (length(configtz) == 0) configtz = desiredtz blockBytes = 512 headerBytes = 1024 + maxAllowedCorruptBlocks = 20 # max number consecutive blocks with a failed checksum that we'll tolerate + # Credits: The original version of this code developed outside GitHub was # contributed by Dr. Evgeny Mirkes (Leicester University, UK) #======================================================================== @@ -75,6 +77,28 @@ readAxivity = function(filename, start = 0, end = 0, progressBar = FALSE, desire return(NULL) } + # sampling rate in one of file format U8 at offset 24 + samplerate_dynrange = readBin(block[25], integer(), size = 1, signed = FALSE) + + if (samplerate_dynrange != 0) { # Very old files that have zero at offset 24 don't have a checksum + checksum = sum(readBin(block, n = 256, + integer(), + size = 2, + signed = FALSE, + endian = "little")) + checksum = checksum %% 65536 # 65536 = 2^16; the checksum is calculated as a 16-bit integer + if (checksum != 0) { + # Checksum doesn't match. This means some bits in this block got corrupted. + # We don't know which, so we can't trust this block. We skip it, and impute it later. + rawdata_list = list( + struc = struc, + parameters = parameters, + checksum_pass = FALSE + ) + return(invisible(rawdata_list)) + } + } + idstr = readChar(block, 2, useBytes = TRUE) if (idstr != "AX") { stop("Packet header is incorrect. First two characters must be AX.") @@ -114,22 +138,6 @@ readAxivity = function(filename, start = 0, end = 0, progressBar = FALSE, desire } else { battery = 0 } - # sampling rate in one of file format U8 at offset 24 - samplerate_dynrange = readBin(block[25], integer(), size = 1, signed = FALSE) - - checksum_pass = TRUE - if (samplerate_dynrange != 0) { # Very old files that have zero at offset 24 don't have a checksum - # Perform checksum - checksum = sum(readBin(block, n = 256, - integer(), - size = 2, - signed = FALSE, - endian = "little")) - checksum = checksum %% 65536 # equals 2^16 the checksum is calculated on a 16bit integer - if (checksum != 0) { - checksum_pass = FALSE - } - } # offset 25, per documentation: # "top nibble: number of axes, 3=Axyz, 6=Gxyz/Axyz, 9=Gxyz/Axyz/Mxyz; @@ -244,7 +252,7 @@ readAxivity = function(filename, start = 0, end = 0, progressBar = FALSE, desire length = blockLength, struc = struc, parameters = parameters, - checksum_pass = checksum_pass, + checksum_pass = TRUE, blockID = blockID ) if (complete) { @@ -302,14 +310,33 @@ readAxivity = function(filename, start = 0, end = 0, progressBar = FALSE, desire accrange = bitwShiftR(16, (bitwShiftR(samplerate_dynrange, 6))) version = readBin(block[42], integer(), size = 1, signed = FALSE) #offset 41 - # Read the first data block without data - datas = readDataBlock(fid, complete = FALSE) - if (is.null(datas)) { - stop("Error reading the first data block.") + # Read the first data block without data. + # Skip any corrupt blocks, up to maxAllowedCorruptBlocks in number. + is_corrupt = TRUE + for (ii in 0:min(numDBlocks-1, maxAllowedCorruptBlocks)) { + datas = readDataBlock(fid, complete = FALSE) + + if (is.null(datas)) { + stop("Error reading the first data block.") + } + + if (datas$checksum_pass) { + is_corrupt = FALSE + break + } + + warning("Skipping corrupt block #", ii) + } + if (is_corrupt) { + if (ii==numDBlocks-1) { + stop("Error reading file. Every block is corrupt.") + } + stop("Error reading file. The first ", maxAllowedCorruptBlocks+1, " blocks are corrupt.") } + if (frequency_header != datas$frequency) { warning("Inconsistent value of measurement frequency: there is ", - frequency_header, " in header and ", datas$frequency, " in the first data block ") + frequency_header, " in header and ", datas$frequency, " in the first data block. ") } blockLength = datas$length # number of samples in a block @@ -340,6 +367,9 @@ readAxivity = function(filename, start = 0, end = 0, progressBar = FALSE, desire on.exit({ close(fid) }) + + QClog = NULL # Initialise log of data quality issues + ############################################################################# # read header if (is.null(header)) { @@ -368,14 +398,59 @@ readAxivity = function(filename, start = 0, end = 0, progressBar = FALSE, desire struc = list(0,0L,0) if (end < numDBlocks) { # the end block isn't part of the data we'll read, but its start will be our ending timestamp seek(fid, headerBytes + blockBytes * end, origin = 'start') - endBlock = readDataBlock(fid, struc = struc) + + # Skip any corrupt blocks, up to maxAllowedCorruptBlocks in number. + # Skip forward, so we'll end up reading more blocks than requested. + # This is important because when corrupt blocks are in the beginning of the requested interval, + # we also skip forward, ending up with fewer blocks than requested. + # for example, if we have a file with corrupt blocks 8-12, and we read the file 10 blocks at a time, + # then the 1st request will end up with blocks 1-12, and the 2nd request with blocks 13-20, + # and between the two requests, all 20 blocks will be accounted for. + is_corrupt = TRUE + for (ii in end : min(numDBlocks-1, end+maxAllowedCorruptBlocks)) { + endBlock = readDataBlock(fid, struc = struc) + + if (endBlock$checksum_pass) { + is_corrupt = FALSE + break + } + + warning("Skipping corrupt end block #", ii) + end = end + 1 + } + if (is_corrupt && ii == end+maxAllowedCorruptBlocks) { + stop("Error reading file. The last ", maxAllowedCorruptBlocks+1, " blocks are corrupt.") + } endTimestamp = as.numeric(endBlock$start) - } else { + } + + if (end == numDBlocks) { # end == numDBlocks, meaning we'll be reading all the remaining blocks. # There is no block #numDBlocks, so we can't get the ending timestamp from the start of that block. - # Instead read the very last block of the file, and project what the ending timestamp should be. - seek(fid, headerBytes + blockBytes * (end-1), origin = 'start') - lastBlock = readDataBlock(fid, struc = struc) + # Instead read the very last block of the file (if the last block is corrupt, fing the last non-corrupt one), + # then project what the ending timestamp should be. + + # Skip any corrupt blocks, up to maxAllowedCorruptBlocks in number. + # SInce we are at the last block, we have to go backwards. + is_corrupt = TRUE + for (ii in (end-1) : max(start, end-maxAllowedCorruptBlocks-1)) { + seek(fid, headerBytes + blockBytes * ii, origin = 'start') + lastBlock = readDataBlock(fid, struc = struc) + + if (lastBlock$checksum_pass) { + is_corrupt = FALSE + break + } + + warning("Skipping corrupt end block #", ii) + end = end - 1 + } + if (is_corrupt) { + if (start == end) { + stop("Error reading file. All requested blocks are corrupt.") + } + stop("Error reading file. The last ", maxAllowedCorruptBlocks+1, " blocks are corrupt.") + } # the end timestamp should fall right after the actual very last timestamp of the file endTimestamp = as.numeric(lastBlock$start) + blockLength * step # now pad it generously in case there are gaps in the last block @@ -386,7 +461,36 @@ readAxivity = function(filename, start = 0, end = 0, progressBar = FALSE, desire # Reinitiate file and skip header as well as the initial start-1 blocks seek(fid, headerBytes + blockBytes * start, origin = 'start') pos = 1 # position of the first element to complete in data - prevRaw = readDataBlock(fid, struc = struc) + + # Skip any corrupt blocks, up to maxAllowedCorruptBlocks in number. + is_corrupt = TRUE + for (ii in 1 : min((end-start), maxAllowedCorruptBlocks+1)) { + prevRaw = readDataBlock(fid, struc = struc) + + if (prevRaw$checksum_pass) { + is_corrupt = FALSE + break + } + QClog = rbind(QClog, data.frame(checksum_pass = FALSE, + blockID_current = start, # the actual block ID wasn't read b/c the block is corrupt + blockID_next = start+1, + start = 0, + end = 0, + blockLengthSeconds = 0, + frequency_blockheader = 0, + frequency_observed = 0, + imputed = FALSE)) + + warning("Skipping corrupt start block #", start) + start = start + 1 + } + if (is_corrupt) { + if (start == end) { + stop("Error reading file. All requested blocks are corrupt.") + } + stop("Error reading file. The first ", maxAllowedCorruptBlocks+1, " blocks are corrupt.") + } + if (is.null(prevRaw)) { return(invisible(list(header = header, data = NULL))) } @@ -414,8 +518,6 @@ readAxivity = function(filename, start = 0, end = 0, progressBar = FALSE, desire rawTime = vector(mode = "numeric", blockLength + 2) rawPos = 1 - QClog = NULL # Initialise log of data quality issues - # Read the data for (ii in (start+1):end) { if (ii == numDBlocks) { @@ -431,6 +533,23 @@ readAxivity = function(filename, start = 0, end = 0, progressBar = FALSE, desire } else { # read a new block raw = readDataBlock(fid, struc = struc, parameters = prevRaw$parameters) + if (!raw$checksum_pass) { + # If the checksum doesn't match, we can't trust any of this block's data, + # so we have to completely skip the block. + # Depending on the nature of the faulty block, the data for the time period it represented + # will probably get imputed later, once we encounter a block with a valid checksum. + QClog = rbind(QClog, data.frame(checksum_pass = FALSE, + blockID_current = ii, # the actual block ID wasn't read b/c the block is corrupt + blockID_next = ii + 1, + start = 0, + end = 0, + blockLengthSeconds = 0, + frequency_blockheader = 0, + frequency_observed = 0, + imputed = FALSE)) + warning("Skipping corrupt block #", ii) + next + } if (is.null(raw)) { # this shouldn't happen @@ -472,7 +591,6 @@ readAxivity = function(filename, start = 0, end = 0, progressBar = FALSE, desire if ((ii < numDBlocks && raw$blockID != 0 && raw$blockID - prevRaw$blockID != 1) || - prevRaw$checksum_pass == FALSE || frequency_bias > frequency_tol) { # Log and impute this event doQClog = TRUE @@ -499,7 +617,7 @@ readAxivity = function(filename, start = 0, end = 0, progressBar = FALSE, desire } if (doQClog == TRUE) { # Note: This is always a description of the previous block - QClog = rbind(QClog, data.frame(checksum_pass = prevRaw$checksum_pass, + QClog = rbind(QClog, data.frame(checksum_pass = TRUE, blockID_current = prevRaw$blockID, blockID_next = raw$blockID, start = prevRaw$start,