Patch summary: switch language detection from the optimaize language-detector library to lingua.
* pom.xml: removes the com.optimaize.languagedetector / language-detector 0.6 entry and adds com.github.pemistahl / lingua 1.1.0.
* QALanguageDetector: builds the detector via LanguageDetectorBuilder.fromAllSpokenLanguages(); detect(String) now returns a lingua Language instead of an Optional; adds detectWithConfidence(String), which returns the SortedMap from computeLanguageConfidenceValues.
* QALanguageDetectorTest: compares the ISO 639-1 code of the detected Language; the input "1984." is asserted to yield Language.UNKNOWN (this replaces a commented-out assertion, so the no-language case is checked again).
diff --git a/pom.xml b/pom.xml index 9ecb2f2a..718183a8 100644 --- a/pom.xml +++ b/pom.xml @@ -109,11 +109,6 @@ slf4j-api 1.7.25 - - com.optimaize.languagedetector - language-detector - 0.6 - com.opencsv @@ -164,6 +159,13 @@ commons-cli 1.4 + + + + com.github.pemistahl + lingua + 1.1.0 + diff --git a/src/main/java/de/gwdg/metadataqa/api/util/QALanguageDetector.java b/src/main/java/de/gwdg/metadataqa/api/util/QALanguageDetector.java index 02e69926..7a6f7b67 100644 --- a/src/main/java/de/gwdg/metadataqa/api/util/QALanguageDetector.java +++ b/src/main/java/de/gwdg/metadataqa/api/util/QALanguageDetector.java @@ -1,16 +1,15 @@ package de.gwdg.metadataqa.api.util; -import com.google.common.base.Optional; -import com.optimaize.langdetect.LanguageDetector; -import com.optimaize.langdetect.LanguageDetectorBuilder; -import com.optimaize.langdetect.i18n.LdLocale; -import com.optimaize.langdetect.ngram.NgramExtractors; -import com.optimaize.langdetect.profiles.LanguageProfile; -import com.optimaize.langdetect.profiles.LanguageProfileReader; -import com.optimaize.langdetect.text.CommonTextObjectFactories; -import com.optimaize.langdetect.text.TextObjectFactory; +import com.github.pemistahl.lingua.api.IsoCode639_1; +import com.github.pemistahl.lingua.api.Language; +import com.github.pemistahl.lingua.api.LanguageDetector; +import com.github.pemistahl.lingua.api.LanguageDetectorBuilder; + import java.io.IOException; +import java.util.ArrayList; import java.util.List; +import java.util.SortedMap; +import java.util.stream.Collectors; /** * @@ -18,25 +17,21 @@ */ class QALanguageDetector { - private List languageProfiles; private LanguageDetector languageDetector; - private TextObjectFactory textObjectFactory; QALanguageDetector() throws IOException { - languageProfiles = new LanguageProfileReader().readAllBuiltIn(); - - //build language detector: languageDetector = LanguageDetectorBuilder - .create(NgramExtractors.standard()) - .withProfiles(languageProfiles) - .build(); + 
.fromAllSpokenLanguages() + .build(); + } - //create a text object factory - textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); + public Language detect(String text) { + final Language detectedLanguage = languageDetector.detectLanguageOf(text); + return detectedLanguage; } - public Optional detect(String text) { - var textObject = textObjectFactory.forText(text); - return languageDetector.detect(textObject); + public SortedMap detectWithConfidence(String text) { + final SortedMap detectedLanguages = languageDetector.computeLanguageConfidenceValues(text); + return detectedLanguages; } } diff --git a/src/test/java/de/gwdg/metadataqa/api/util/QALanguageDetectorTest.java b/src/test/java/de/gwdg/metadataqa/api/util/QALanguageDetectorTest.java index e5ee0a8c..9c001a53 100644 --- a/src/test/java/de/gwdg/metadataqa/api/util/QALanguageDetectorTest.java +++ b/src/test/java/de/gwdg/metadataqa/api/util/QALanguageDetectorTest.java @@ -1,8 +1,9 @@ package de.gwdg.metadataqa.api.util; -import com.google.common.base.Optional; -import com.optimaize.langdetect.i18n.LdLocale; +import com.github.pemistahl.lingua.api.Language; import java.io.IOException; +import java.util.SortedMap; + import org.junit.After; import org.junit.AfterClass; import org.junit.Before; @@ -38,24 +39,25 @@ public void tearDown() { @Test public void constructionTest() throws IOException { QALanguageDetector languageDetector = new QALanguageDetector(); - Optional langs = languageDetector.detect("There are no plans to deprecate this class in the foreseeable future."); + Language langs = languageDetector.detect("There are no plans to deprecate this class in the foreseeable future."); assertNotNull(langs); - assertTrue(langs.isPresent()); - assertEquals("en", langs.get().getLanguage()); + assertEquals("en", langs.getIsoCode639_1().toString()); langs = languageDetector.detect("Der Literaturnobelpreis 2016 ging an den Musiker Bob Dylan. 
Mit dieser Entscheidung erkannte die Jury zum ersten Mal die literarische Qualität von Songtexten an. Nicht jeder fand das gut."); assertNotNull(langs); - assertTrue(langs.isPresent()); - assertEquals("de", langs.get().getLanguage()); + assertEquals("de", langs.getIsoCode639_1().toString()); langs = languageDetector.detect("Ég a napmelegtől a kopár szík sarja."); assertNotNull(langs); - assertTrue(langs.isPresent()); - assertEquals("hu", langs.get().getLanguage()); + assertEquals("hu", langs.getIsoCode639_1().toString()); langs = languageDetector.detect("1984."); assertNotNull(langs); - assertFalse(langs.isPresent()); + assertEquals(Language.UNKNOWN, langs); + + SortedMap confidences = languageDetector.detectWithConfidence("Der Literaturnobelpreis 2016 ging an den Musiker Bob Dylan. Mit dieser Entscheidung erkannte die Jury zum ersten Mal die literarische Qualität von Songtexten an. Nicht jeder fand das gut."); + assertNotNull(confidences); + assertEquals("de", confidences.firstKey().getIsoCode639_1().toString()); } }