# ---- R/check-office.R ----

# Identify Office Open XML documents (docx, pptx, xlsx) among "PK" archives.
#
# `hdr`  : the leading bytes of the file, as already read by the caller. Only
#          positions 31:49 are inspected. In a ZIP local file header the fixed
#          part is 30 bytes long, so position 31 is where the name of the first
#          archive entry starts.
# `path` : path to the same file; up to 4096 bytes are re-read from it to look
#          for the per-format directory prefix.
#
# Returns the MIME type as a length-1 character vector, or NULL when the first
# entry name is not one of the two recognised names or when none of the
# directory prefixes occurs in the first 4096 bytes.
check_office <- function(hdr, path) {

  # Names the first archive entry is expected to carry.
  pat_content_types <- charToRaw("[Content_Types].xml") # 19 bytes
  pat_rels <- charToRaw("_rels/.rels")                  # 11 bytes

  # isTRUE() guards the scalar condition: indexing a short non-raw `hdr` past
  # its end yields NA, and `if (NA)` would otherwise abort with an error.
  looks_like_ooxml <-
    isTRUE(all(pat_content_types == hdr[31:49])) ||
    isTRUE(all(pat_rels == hdr[31:41]))

  if (!looks_like_ooxml) {
    return(NULL)
  }

  head_bytes <- readBin(path, "raw", n = 4096)

  # Checked in this order; the first directory prefix found decides the type.
  markers <- list(
    list(
      prefix = "word/",
      type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ),
    list(
      prefix = "ppt/",
      type = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    ),
    list(
      prefix = "xl/",
      type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )
  )

  for (marker in markers) {
    if (length(seq_in(head_bytes, charToRaw(marker$prefix))) > 0) {
      return(marker$type)
    }
  }

  NULL
}

# ---- R/get-content-type.R (call site) ----
#
# Excerpt from get_content_type(). Only this fragment of the function is
# visible, so it is reproduced as a comment rather than as code:
#
#   if (all(c(0x52,0x49,0x46,0x46) == hdr[1:4])) return("audio/x-wav") # "RIFF"
#
#   if (all(c(0x50, 0x4b) == hdr[1:2])) { # "PK"
#
#     office_type <- check_office(hdr, path)
#     if (length(office_type) > 0) return(office_type)
#
#     guessed_name <- guess_content_type(path)
#     if ((length(guessed_name) == 1) && (guessed_name != "???")) return(guessed_name)
#
#     return("application/zip")
#
#   }
#
#   if (all(c(0x5a,0x4d) == hdr[1:2])) return("x-system/exe")

# ---- R/util.R ----

# Find every occurrence of the sequence `b` inside the vector `a`.
#
# `a` : vector to search (e.g. raw bytes).
# `b` : sequence to look for; elements are compared with `==`, so a numeric
#       pattern can be matched against raw data.
#
# Returns an integer vector with the 1-based start position of each match,
# or integer(0) when there is none, when `b` is empty, or when `b` is longer
# than `a`.
seq_in <- function(a, b) {

  n_a <- length(a)
  n_b <- length(b)

  # A pattern longer than the data cannot match. Without this guard the
  # window below would run backwards and index with zero/negative positions.
  if (n_b == 0L || n_b > n_a) {
    return(integer(0))
  }

  n_starts <- n_a - n_b + 1L
  hits <- rep(TRUE, n_starts)

  # For the k-th pattern element, compare it against the k-th element of every
  # candidate window at once; a start survives only if all elements agree.
  for (k in seq_len(n_b)) {
    hits <- hits & (a[k:(k + n_starts - 1L)] == b[k])
  }

  which(hits)
}