Skip to content
This repository has been archived by the owner on Jul 19, 2019. It is now read-only.

Commit

Permalink
more magic types and better docs
Browse files Browse the repository at this point in the history
  • Loading branch information
hrbrmstr committed Mar 22, 2018
1 parent 5037d21 commit f56737d
Show file tree
Hide file tree
Showing 12 changed files with 112 additions and 12 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ Maintainer: Bob Rudis <[email protected]>
Description: 'MIME' types are shorthand descriptors for file contents and can be
determined from "magic" bytes in file headers, file contents or intuited from
file extensions. Tools are provided to perform limited "magic" tests as well
as mapping 'MIME' types from a database of over 1,500 extension mappings.
as mapping 'MIME' types from a database of over 1,500 extension mappings. It is
useful as a more portable alternative to the 'wand' package.
URL: https://github.com/hrbrmstr/simplemagic
BugReports: https://github.com/hrbrmstr/simplemagic/issues
Encoding: UTF-8
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@

export(get_content_type)
export(guess_content_type)
export(simplemagic_mime_db)
importFrom(tools,file_ext)
6 changes: 5 additions & 1 deletion R/aaa.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
#' File extension-to-MIME mapping data frame
#'
#' @docType data
#' @export
structure(list(extension = c("pyc", "dwg", "ez", "aw", "arj",
"atom", "xml", "atomcat", "atomsvc", "mm", "mme", "hqx", "hqx",
"boo", "book", "ccxml", "cdf", "cdmia", "cdmic", "cdmid", "cdmio",
Expand Down Expand Up @@ -813,4 +817,4 @@ structure(list(extension = c("pyc", "dwg", "ez", "aw", "arj",
"text/x-pascal", "video/jpm", "video/mj2", "video/mp4", "video/mpeg",
"video/quicktime", "video/vnd.mpegurl", "video/x-ms-asf")), row.names = c(NA,
-1763L), class = c("tbl_df", "tbl", "data.frame"), .Names = c("extension",
"mime_type")) -> .ext_to_mime
"mime_type")) -> simplemagic_mime_db
32 changes: 31 additions & 1 deletion R/get-content-type.R
Original file line number Diff line number Diff line change
@@ -1,18 +1,26 @@
#' Discover MIME type of a file based on contents
#'
#' A limited number of header "magic" byte sequences are checked directly by
#' this function, but they cover quite a bit of ground. After that,
#' [guess_content_type()] is called, which uses
#' file extension-to-MIME mappings. File an issue or PR if more magic-byte-level
#' comparisons are required/desired. If no match is found, `???` is returned.
#'
#' @details
#' Initial in-R header mapping logic borrowed from `MimeTypes.java` from
#' [`servoy-client`](https://github.com/Servoy/servoy-client)
#'
#' @md
#' @param path path to a file
#' @return character vector
#' @export
#' @examples
#' get_content_type(system.file("extdat", "test.pdf", package="simplemagic"))
get_content_type <- function(path) {

path <- path.expand(path)
if (!file.exists(path)) stop("File not found.", call.=FALSE)

hdr <- readBin(path, "raw", n=11)
hdr <- readBin(path, "raw", n=1024)

if (all(c(0xCA,0xFE,0xBA,0xBE) == hdr[1:4])) return("application/java-vm")

Expand All @@ -23,6 +31,7 @@ get_content_type <- function(path) {
}

if (all(c(0x25,0x50,0x44,0x46,0x2d,0x31,0x2e) == hdr[1:7])) return("application/pdf")
if (all(c(0x25,0x50,0x44,0x46) == hdr[1:4])) return("application/x-pdf")

if (all(c(0x38,0x42,0x50,0x53,0x00,0x01) == hdr[1:6])) return("image/photoshop")

Expand Down Expand Up @@ -54,6 +63,19 @@ get_content_type <- function(path) {
if (all(c(0x21,0x20,0x58,0x50,0x4d,0x32) == hdr[1:6])) return("image/x-pixmap")
if (all(c(137,80,78,71,13,10,26,10) == hdr[1:8])) return("image/png")

if (all(c(0x23,0x21,0x2f,0x62,0x69,0x6e,0x2f,0x6e,0x6f,0x64,0x65) == hdr[1:11]))
return("application/javascript")
if (all(c(0x23,0x21,0x2f,0x62,0x69,0x6e,0x2f,0x6e,0x6f,0x64,0x65,0x6a,0x73) == hdr[1:13]))
return("application/javascript")
if (all(c(0x23,0x21,0x2f,0x75,0x73,0x72,0x2f,0x62,0x69,0x6e,0x2f,0x6e,0x6f,0x64,0x65) == hdr[1:15]))
return("application/javascript")
if (all(c(0x23,0x21,0x2f,0x75,0x73,0x72,0x2f,0x62,0x69,0x6e,0x2f,0x6e,0x6f,0x64,0x65,0x6a,0x73) == hdr[1:17]))
return("application/javascript")
if (all(c(0x23,0x21,0x2f,0x75,0x73,0x72,0x2f,0x62,0x69,0x6e,0x2f,0x65,0x6e,0x76,0x20,0x6e,0x6f,0x64,0x65) == hdr[1:19]))
return("application/javascript")
if (all(c(0x23,0x21,0x2f,0x75,0x73,0x72,0x2f,0x62,0x69,0x6e,0x2f,0x65,0x6e,0x76,0x20,0x6e,0x6f,0x64,0x65,0x6a,0x73) == hdr[1:21]))
return("application/javascript")

if (all(c(0xFF,0xD8,0xFF) == hdr[1:3])) {
if (0xE0 == hdr[4]) return("image/jpeg")
if (0xE1 == hdr[4]) {
Expand All @@ -75,6 +97,14 @@ get_content_type <- function(path) {
return("application/zip")
}

if (all(c(0x5a,0x4d) == hdr[1:2])) return("x-system/exe")

if (all(c(0x75,0x73,0x74,0x61,0x72) == hdr[258:262])) return("application/pax")

if (all(c(0x00,0x00,0x01,0xBA) == hdr[1:4])) return("video/mpeg")
if (all(c(0x00,0x00,0x01,0xB3) == hdr[1:4])) return("video/mpeg")


return(guess_content_type(path))

}
9 changes: 8 additions & 1 deletion R/guess-content-type.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
#' Guess MIME type from filename (extension)
#'
#' Uses an internal database of over 1,500 file extension-to-MIME mappings to
#' return one or more associated types for a given input path. If no match is
#' found, `???` is returned.
#'
#' @details
#' Incorporates standard IANA MIME extension mappings and those from
#' [`servoy-client`](https://github.com/Servoy/servoy-client) and
Expand All @@ -8,15 +12,18 @@
#'
#' @md
#' @param path path to file
#' @return character vector
#' @export
#' @examples
#' guess_content_type(system.file("extdat", "test.pdf", package="simplemagic"))
guess_content_type <- function(path) {

path <- path.expand(path)
if (!file.exists(path)) stop("File not found.", call.=FALSE)

extension <- trimws(tolower(tools::file_ext(path)))

res <- .ext_to_mime[(.ext_to_mime$extension == extension),]$mime_type
res <- simplemagic_mime_db[(simplemagic_mime_db$extension == extension),]$mime_type

if (length(res) == 0) return("???")

Expand Down
10 changes: 10 additions & 0 deletions R/simplemagic-package.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
#' Lightweight File 'MIME' Type Detection Based On Contents or Extension
#'
#' Provides a more portable/lightweight alternative to the `wand` package.
#'
#' @section Some important details:
#'
#' The header checking is minimal (i.e. nowhere near as comprehensive as `libmagic`) but
#' covers quite a bit of ground. If there are content-check types from
#' [`magic sources`](https://github.com/threatstack/libmagic/tree/master/magic/)
#' that you would like coded into the package, please file an issue and
#' _include the full line(s)_ from the linked magic source files that you would like mapped.
#'
#' @md
#' @name simplemagic
#' @docType package
Expand Down
6 changes: 4 additions & 2 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,17 @@ determined from "magic" bytes in file headers, file contents or intuited from
file extensions. Tools are provided to perform limited "magic" tests as well
as mapping 'MIME' types from a database of over 1,500 extension mappings.

Provides a more portable/lightweight alternative to the `wand` package.

## SOME IMPORTANT DETAILS

The header checking is minimal (i.e. nowhere near as comprehensive as `libmagic`).
If there are content-check types from [`magic.tab`](https://opensource.apple.com/source/ksh/ksh-13/ksh/src/lib/libast/misc/magic.tab.auto.html) that you would like coded into the package, please file an issue and _include the full line(s)_ from that linked `magic.tab` that you would like mapped.
The header checking is minimal (i.e. nowhere near as comprehensive as `libmagic`) but covers quite a bit of ground. If there are content-check types from [`magic sources`](https://github.com/threatstack/libmagic/tree/master/magic/) that you would like coded into the package, please file an issue and _include the full line(s)_ from the linked magic source files that you would like mapped.

## What's Inside The Tin

- `get_content_type`: Discover MIME type of a file based on contents
- `guess_content_type`: Guess MIME type from filename (extension)
- `simplemagic_mime_db`: File extension-to-MIME mapping data frame

The following functions are implemented:

Expand Down
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,14 @@ from file extensions. Tools are provided to perform limited “magic”
tests as well as mapping ‘MIME’ types from a database of over 1,500
extension mappings.

Provides a more portable/lightweight alternative to the `wand` package.

## SOME IMPORTANT DETAILS

The header checking is minimal (i.e. nowhere near as comprehensive as
`libmagic`). If there are content-check types from
[`magic.tab`](https://opensource.apple.com/source/ksh/ksh-13/ksh/src/lib/libast/misc/magic.tab.auto.html)
`libmagic`) but covers quite a bit of ground. If there are content-check
types from [`magic
sources`](https://github.com/threatstack/libmagic/tree/master/magic/)
that you would like coded into the package, please file an issue and
*include the full line(s)* from the linked magic source files that you
would like mapped.
Expand All @@ -24,6 +27,7 @@ like mapped.

- `get_content_type`: Discover MIME type of a file based on contents
- `guess_content_type`: Guess MIME type from filename (extension)
- `simplemagic_mime_db`: File extension-to-MIME mapping data frame

The following functions are implemented:

Expand Down Expand Up @@ -124,7 +128,7 @@ list.files(system.file("extdat", package="simplemagic"), full.names=TRUE) %>%
## 62 test.sh application/x-shar
## 63 test.sh text/x-script.sh
## 64 test.sh text/x-sh
## 65 test.tar application/x-tar
## 65 test.tar application/pax
## 66 test.tar.gz application/octet-stream
## 67 test.tar.gz application/x-compressed
## 68 test.tar.gz application/x-gzip
Expand Down
11 changes: 10 additions & 1 deletion man/get_content_type.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 9 additions & 1 deletion man/guess_content_type.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 11 additions & 1 deletion man/simplemagic.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions man/simplemagic_mime_db.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit f56737d

Please sign in to comment.