diff --git a/pom.xml b/pom.xml
index 9ecb2f2a..718183a8 100644
--- a/pom.xml
+++ b/pom.xml
@@ -109,11 +109,6 @@
slf4j-api
1.7.25
-
- com.optimaize.languagedetector
- language-detector
- 0.6
-
com.opencsv
@@ -164,6 +159,13 @@
commons-cli
1.4
+
+
+
+ com.github.pemistahl
+ lingua
+ 1.1.0
+
diff --git a/src/main/java/de/gwdg/metadataqa/api/util/QALanguageDetector.java b/src/main/java/de/gwdg/metadataqa/api/util/QALanguageDetector.java
index 02e69926..7a6f7b67 100644
--- a/src/main/java/de/gwdg/metadataqa/api/util/QALanguageDetector.java
+++ b/src/main/java/de/gwdg/metadataqa/api/util/QALanguageDetector.java
@@ -1,16 +1,15 @@
package de.gwdg.metadataqa.api.util;
-import com.google.common.base.Optional;
-import com.optimaize.langdetect.LanguageDetector;
-import com.optimaize.langdetect.LanguageDetectorBuilder;
-import com.optimaize.langdetect.i18n.LdLocale;
-import com.optimaize.langdetect.ngram.NgramExtractors;
-import com.optimaize.langdetect.profiles.LanguageProfile;
-import com.optimaize.langdetect.profiles.LanguageProfileReader;
-import com.optimaize.langdetect.text.CommonTextObjectFactories;
-import com.optimaize.langdetect.text.TextObjectFactory;
+import com.github.pemistahl.lingua.api.IsoCode639_1;
+import com.github.pemistahl.lingua.api.Language;
+import com.github.pemistahl.lingua.api.LanguageDetector;
+import com.github.pemistahl.lingua.api.LanguageDetectorBuilder;
+
import java.io.IOException;
+import java.util.ArrayList;
import java.util.List;
+import java.util.SortedMap;
+import java.util.stream.Collectors;
/**
*
@@ -18,25 +17,21 @@
*/
class QALanguageDetector {
- private List languageProfiles;
private LanguageDetector languageDetector;
- private TextObjectFactory textObjectFactory;
QALanguageDetector() throws IOException {
- languageProfiles = new LanguageProfileReader().readAllBuiltIn();
-
- //build language detector:
languageDetector = LanguageDetectorBuilder
- .create(NgramExtractors.standard())
- .withProfiles(languageProfiles)
- .build();
+ .fromAllSpokenLanguages()
+ .build();
+ }
- //create a text object factory
- textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
+ public Language detect(String text) {
+ // Per Lingua's API docs, undetectable input yields Language.UNKNOWN rather than null or an empty Optional.
+ return languageDetector.detectLanguageOf(text);
}
- public Optional detect(String text) {
- var textObject = textObjectFactory.forText(text);
- return languageDetector.detect(textObject);
+ public SortedMap<Language, Double> detectWithConfidence(String text) {
+ // Per Lingua's API docs, the map is ordered by descending confidence, so firstKey() is the most likely language.
+ return languageDetector.computeLanguageConfidenceValues(text);
}
}
diff --git a/src/test/java/de/gwdg/metadataqa/api/util/QALanguageDetectorTest.java b/src/test/java/de/gwdg/metadataqa/api/util/QALanguageDetectorTest.java
index e5ee0a8c..9c001a53 100644
--- a/src/test/java/de/gwdg/metadataqa/api/util/QALanguageDetectorTest.java
+++ b/src/test/java/de/gwdg/metadataqa/api/util/QALanguageDetectorTest.java
@@ -1,8 +1,9 @@
package de.gwdg.metadataqa.api.util;
-import com.google.common.base.Optional;
-import com.optimaize.langdetect.i18n.LdLocale;
+import com.github.pemistahl.lingua.api.Language;
import java.io.IOException;
+import java.util.SortedMap;
+
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
@@ -38,24 +39,25 @@ public void tearDown() {
@Test
public void constructionTest() throws IOException {
QALanguageDetector languageDetector = new QALanguageDetector();
- Optional langs = languageDetector.detect("There are no plans to deprecate this class in the foreseeable future.");
+ Language langs = languageDetector.detect("There are no plans to deprecate this class in the foreseeable future.");
assertNotNull(langs);
- assertTrue(langs.isPresent());
- assertEquals("en", langs.get().getLanguage());
+ assertEquals("en", langs.getIsoCode639_1().toString());
langs = languageDetector.detect("Der Literaturnobelpreis 2016 ging an den Musiker Bob Dylan. Mit dieser Entscheidung erkannte die Jury zum ersten Mal die literarische Qualität von Songtexten an. Nicht jeder fand das gut.");
assertNotNull(langs);
- assertTrue(langs.isPresent());
- assertEquals("de", langs.get().getLanguage());
+ assertEquals("de", langs.getIsoCode639_1().toString());
langs = languageDetector.detect("Ég a napmelegtől a kopár szík sarja.");
assertNotNull(langs);
- assertTrue(langs.isPresent());
- assertEquals("hu", langs.get().getLanguage());
+ assertEquals("hu", langs.getIsoCode639_1().toString());
langs = languageDetector.detect("1984.");
assertNotNull(langs);
- assertFalse(langs.isPresent());
+ assertEquals(Language.UNKNOWN, langs); // no letters to analyse: Lingua's replacement for the old "Optional is absent" case
+
+ SortedMap<Language, Double> confidences = languageDetector.detectWithConfidence("Der Literaturnobelpreis 2016 ging an den Musiker Bob Dylan. Mit dieser Entscheidung erkannte die Jury zum ersten Mal die literarische Qualität von Songtexten an. Nicht jeder fand das gut.");
+ assertNotNull(confidences);
+ assertEquals("de", confidences.firstKey().getIsoCode639_1().toString());
}
}