From 253313772d2c3635e231a323c78d01be9ed26d0d Mon Sep 17 00:00:00 2001 From: Miel Vander Sande Date: Tue, 6 Jul 2021 11:25:11 +0200 Subject: [PATCH 1/2] Swaps language detection package --- pom.xml | 12 ++++--- .../api/util/QALanguageDetector.java | 35 +++++++------------ .../api/util/QALanguageDetectorTest.java | 16 ++++----- 3 files changed, 25 insertions(+), 38 deletions(-) diff --git a/pom.xml b/pom.xml index 9ecb2f2a..718183a8 100644 --- a/pom.xml +++ b/pom.xml @@ -109,11 +109,6 @@ slf4j-api 1.7.25 - - com.optimaize.languagedetector - language-detector - 0.6 - com.opencsv @@ -164,6 +159,13 @@ commons-cli 1.4 + + + + com.github.pemistahl + lingua + 1.1.0 + diff --git a/src/main/java/de/gwdg/metadataqa/api/util/QALanguageDetector.java b/src/main/java/de/gwdg/metadataqa/api/util/QALanguageDetector.java index 02e69926..81cc0e0b 100644 --- a/src/main/java/de/gwdg/metadataqa/api/util/QALanguageDetector.java +++ b/src/main/java/de/gwdg/metadataqa/api/util/QALanguageDetector.java @@ -1,16 +1,14 @@ package de.gwdg.metadataqa.api.util; -import com.google.common.base.Optional; -import com.optimaize.langdetect.LanguageDetector; -import com.optimaize.langdetect.LanguageDetectorBuilder; -import com.optimaize.langdetect.i18n.LdLocale; -import com.optimaize.langdetect.ngram.NgramExtractors; -import com.optimaize.langdetect.profiles.LanguageProfile; -import com.optimaize.langdetect.profiles.LanguageProfileReader; -import com.optimaize.langdetect.text.CommonTextObjectFactories; -import com.optimaize.langdetect.text.TextObjectFactory; +import com.github.pemistahl.lingua.api.IsoCode639_1; +import com.github.pemistahl.lingua.api.Language; +import com.github.pemistahl.lingua.api.LanguageDetector; +import com.github.pemistahl.lingua.api.LanguageDetectorBuilder; + import java.io.IOException; +import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; /** * @@ -18,25 +16,16 @@ */ class QALanguageDetector { - private List languageProfiles; private LanguageDetector languageDetector; - private TextObjectFactory textObjectFactory; QALanguageDetector() throws IOException { - languageProfiles = new LanguageProfileReader().readAllBuiltIn(); - - //build language detector: languageDetector = LanguageDetectorBuilder - .create(NgramExtractors.standard()) - .withProfiles(languageProfiles) - .build(); - - //create a text object factory - textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); + .fromAllSpokenLanguages() + .build(); } - public Optional detect(String text) { - var textObject = textObjectFactory.forText(text); - return languageDetector.detect(textObject); + public Language detect(String text) { + final Language detectedLanguage = languageDetector.detectLanguageOf(text); + return detectedLanguage; } } diff --git a/src/test/java/de/gwdg/metadataqa/api/util/QALanguageDetectorTest.java b/src/test/java/de/gwdg/metadataqa/api/util/QALanguageDetectorTest.java index e5ee0a8c..df39bbe5 100644 --- a/src/test/java/de/gwdg/metadataqa/api/util/QALanguageDetectorTest.java +++ b/src/test/java/de/gwdg/metadataqa/api/util/QALanguageDetectorTest.java @@ -1,7 +1,6 @@ package de.gwdg.metadataqa.api.util; -import com.google.common.base.Optional; -import com.optimaize.langdetect.i18n.LdLocale; +import com.github.pemistahl.lingua.api.Language; import java.io.IOException; import org.junit.After; import org.junit.AfterClass; @@ -38,24 +37,21 @@ public void tearDown() { @Test public void constructionTest() throws IOException { QALanguageDetector languageDetector = new QALanguageDetector(); - Optional langs = languageDetector.detect("There are no plans to deprecate this class in the foreseeable future."); + Language langs = languageDetector.detect("There are no plans to deprecate this class in the foreseeable future."); assertNotNull(langs); - assertTrue(langs.isPresent()); - assertEquals("en", langs.get().getLanguage()); + assertEquals("en", langs.getIsoCode639_1().toString()); langs = languageDetector.detect("Der Literaturnobelpreis 2016 ging an den Musiker Bob Dylan. Mit dieser Entscheidung erkannte die Jury zum ersten Mal die literarische Qualität von Songtexten an. Nicht jeder fand das gut."); assertNotNull(langs); - assertTrue(langs.isPresent()); - assertEquals("de", langs.get().getLanguage()); + assertEquals("de", langs.getIsoCode639_1().toString()); langs = languageDetector.detect("Ég a napmelegtől a kopár szík sarja."); assertNotNull(langs); - assertTrue(langs.isPresent()); - assertEquals("hu", langs.get().getLanguage()); + assertEquals("hu", langs.getIsoCode639_1().toString()); langs = languageDetector.detect("1984."); assertNotNull(langs); - assertFalse(langs.isPresent()); + //assertFalse(langs); } } From e55d7703d9237f618530cfec5a9f6cbddeae1be5 Mon Sep 17 00:00:00 2001 From: Miel Vander Sande Date: Tue, 6 Jul 2021 11:32:21 +0200 Subject: [PATCH 2/2] Add confidences method to language detection --- .../de/gwdg/metadataqa/api/util/QALanguageDetector.java | 6 ++++++ .../de/gwdg/metadataqa/api/util/QALanguageDetectorTest.java | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/src/main/java/de/gwdg/metadataqa/api/util/QALanguageDetector.java b/src/main/java/de/gwdg/metadataqa/api/util/QALanguageDetector.java index 81cc0e0b..7a6f7b67 100644 --- a/src/main/java/de/gwdg/metadataqa/api/util/QALanguageDetector.java +++ b/src/main/java/de/gwdg/metadataqa/api/util/QALanguageDetector.java @@ -8,6 +8,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.SortedMap; import java.util.stream.Collectors; /** @@ -28,4 +29,9 @@ public Language detect(String text) { final Language detectedLanguage = languageDetector.detectLanguageOf(text); return detectedLanguage; } + + public SortedMap detectWithConfidence(String text) { + final SortedMap detectedLanguages = languageDetector.computeLanguageConfidenceValues(text); + return detectedLanguages; + } } diff --git a/src/test/java/de/gwdg/metadataqa/api/util/QALanguageDetectorTest.java b/src/test/java/de/gwdg/metadataqa/api/util/QALanguageDetectorTest.java index df39bbe5..9c001a53 100644 --- a/src/test/java/de/gwdg/metadataqa/api/util/QALanguageDetectorTest.java +++ b/src/test/java/de/gwdg/metadataqa/api/util/QALanguageDetectorTest.java @@ -2,6 +2,8 @@ import com.github.pemistahl.lingua.api.Language; import java.io.IOException; +import java.util.SortedMap; + import org.junit.After; import org.junit.AfterClass; import org.junit.Before; @@ -53,5 +55,9 @@ public void constructionTest() throws IOException { assertNotNull(langs); //assertFalse(langs); + SortedMap confidences = languageDetector.detectWithConfidence("Der Literaturnobelpreis 2016 ging an den Musiker Bob Dylan. Mit dieser Entscheidung erkannte die Jury zum ersten Mal die literarische Qualität von Songtexten an. Nicht jeder fand das gut."); + assertNotNull(confidences); + assertEquals("de", confidences.firstKey().getIsoCode639_1().toString()); + } }