Skip to content

Commit

Permalink
feat: improve content type detection
Browse files Browse the repository at this point in the history
Allows MS Office files to be detected correctly
  • Loading branch information
LukasLohoff committed Sep 15, 2023
1 parent b1b44aa commit 4b754e3
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 12 deletions.
9 changes: 7 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@

<!-- Utils -->
<commons.io.version>2.13.0</commons.io.version>
<tika.core.version>2.9.0</tika.core.version>
<tika.version>2.9.0</tika.version>
<reflections.version>0.10.2</reflections.version>
<evo-inflector.version>1.3</evo-inflector.version>

Expand Down Expand Up @@ -511,7 +511,12 @@
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>${tika.core.version}</version>
<version>${tika.version}</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>${tika.version}</version>
</dependency>

<!-- Jackson -->
Expand Down
5 changes: 5 additions & 0 deletions shogun-lib/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,11 @@
<artifactId>tika-core</artifactId>
</dependency>

<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers-standard-package</artifactId>
</dependency>

<!-- Testing -->
<dependency>
<groupId>org.springframework</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,11 @@
import de.terrestris.shogun.properties.UploadProperties;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.io.FileUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.Tika;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tomcat.util.http.fileupload.InvalidFileNameException;
import org.apache.tomcat.util.http.fileupload.impl.InvalidContentTypeException;
import org.springframework.beans.factory.annotation.Autowired;
Expand Down Expand Up @@ -66,21 +65,23 @@ public void isValid(MultipartFile file) throws Exception {
this.verifyContentType(file);
}

public void verifyContentType(MultipartFile file) throws IOException, TikaException {
public void verifyContentType(MultipartFile file) throws IOException {
String contentType = file.getContentType();
String name = file.getName();
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
TikaConfig tika = new TikaConfig();
MediaType mediaType = tika.getDetector().detect(TikaInputStream.get(file.getBytes()), metadata);
if (!mediaType.toString().equals(contentType)) {
throw new IOException("Mediatype validation failed. Passed content type is " + contentType + " but detected mediatype is " + mediaType);

Tika tika = new Tika();
String detectedMediaType = tika.detect(TikaInputStream.get(file.getBytes()), metadata);

if (!StringUtils.equalsIgnoreCase(detectedMediaType, contentType)) {
throw new IOException("Media type validation failed. Passed content type is " + contentType + " but detected media type is " + detectedMediaType);
}
}

public void isValidType(String contentType) throws InvalidContentTypeException {
List<String> supportedContentTypes = getSupportedContentTypes();
boolean isMatch = PatternMatchUtils.simpleMatch(supportedContentTypes.toArray(new String[supportedContentTypes.size()]), contentType);
boolean isMatch = PatternMatchUtils.simpleMatch(supportedContentTypes.toArray(new String[0]), contentType);
if (!isMatch) {
log.warn("Unsupported content type {} for upload", contentType);
throw new InvalidContentTypeException("Unsupported content type for upload!");
Expand All @@ -90,7 +91,7 @@ public void isValidType(String contentType) throws InvalidContentTypeException {
public void isValidFileName(String fileName) throws InvalidFileNameException {
List<String> illegalCharacters = Arrays.asList("\\", "/", ":", "*", "?", "\"", "<", ">", "|", "\\0", "\\n");
if (illegalCharacters.stream().anyMatch(fileName::contains)) {
throw new InvalidFileNameException(fileName, "Filename contains illegal chracters. [\\, /, :, *, ?, \", <, >, |, \\0, \\n]");
throw new InvalidFileNameException(fileName, "Filename contains illegal characters. [\\, /, :, *, ?, \", <, >, |, \\0, \\n]");
}
}

Expand Down

0 comments on commit 4b754e3

Please sign in to comment.