Skip to content

Commit

Permalink
Merge pull request #72 from mielvds/language-detection
Browse files Browse the repository at this point in the history
Replace Language detection library with the more accurate lingua
  • Loading branch information
pkiraly authored Jul 6, 2021
2 parents f36c133 + e55d770 commit 284573d
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 37 deletions.
12 changes: 7 additions & 5 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -109,11 +109,6 @@
<artifactId>slf4j-api</artifactId>
<version>1.7.25</version>
</dependency>
<dependency>
<groupId>com.optimaize.languagedetector</groupId>
<artifactId>language-detector</artifactId>
<version>0.6</version>
</dependency>
<!-- CSV reader. Docs: http://opencsv.sourceforge.net -->
<dependency>
<groupId>com.opencsv</groupId>
Expand Down Expand Up @@ -164,6 +159,13 @@
<artifactId>commons-cli</artifactId>
<version>1.4</version>
</dependency>

<!-- language detection -->
<dependency>
<groupId>com.github.pemistahl</groupId>
<artifactId>lingua</artifactId>
<version>1.1.0</version>
</dependency>
</dependencies>

<build>
Expand Down
39 changes: 17 additions & 22 deletions src/main/java/de/gwdg/metadataqa/api/util/QALanguageDetector.java
Original file line number Diff line number Diff line change
@@ -1,42 +1,37 @@
package de.gwdg.metadataqa.api.util;

import com.google.common.base.Optional;
import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObjectFactory;
import com.github.pemistahl.lingua.api.IsoCode639_1;
import com.github.pemistahl.lingua.api.Language;
import com.github.pemistahl.lingua.api.LanguageDetector;
import com.github.pemistahl.lingua.api.LanguageDetectorBuilder;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.SortedMap;
import java.util.stream.Collectors;

/**
* Package-private wrapper around a third-party language detector.
*
* NOTE(review): this listing is a diff view that has lost its +/- markers, so
* lines of the previous implementation (types from com.optimaize.langdetect)
* and of the new implementation (types from com.github.pemistahl.lingua) are
* interleaved below. It does not compile as shown.
*
* @author Péter Király <peter.kiraly at gwdg.de>
*/
class QALanguageDetector {

// Previous implementation: the language profiles handed to the detector builder.
private List<LanguageProfile> languageProfiles;
// The wrapped detector; every detect* method delegates to it.
private LanguageDetector languageDetector;
// Previous implementation: factory turning raw strings into text objects for the detector.
private TextObjectFactory textObjectFactory;

/**
* Builds the wrapped detector once, at construction time.
*
* NOTE(review): in the lines shown, only readAllBuiltIn() (previous
* implementation) looks like a source of IOException; presumably the
* throws clause is kept for backward compatibility with callers — confirm.
*
* @throws IOException declared by the signature
*/
QALanguageDetector() throws IOException {
// Previous implementation: read all language profiles bundled with the library.
languageProfiles = new LanguageProfileReader().readAllBuiltIn();

//build language detector:
languageDetector = LanguageDetectorBuilder
// Previous implementation: standard n-gram extractor plus the profiles read above.
.create(NgramExtractors.standard())
.withProfiles(languageProfiles)
.build();
// New implementation: builder configured via fromAllSpokenLanguages().
.fromAllSpokenLanguages()
.build();
}

//create a text object factory
// Previous implementation (constructor body): factory variant for detecting on large text.
textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
/**
* New implementation: returns whatever detectLanguageOf() reports for the text.
*
* NOTE(review): what is returned for undetectable input is not visible
* here — presumably a dedicated "unknown" constant; verify in the lingua docs.
*
* @param text the text to analyse
* @return the language reported by the wrapped detector
*/
public Language detect(String text) {
final Language detectedLanguage = languageDetector.detectLanguageOf(text);
return detectedLanguage;
}

// Previous implementation of detect(): wrapped the text in a text object and
// returned the detector's Optional<LdLocale> result.
public Optional<LdLocale> detect(String text) {
var textObject = textObjectFactory.forText(text);
return languageDetector.detect(textObject);
/**
* New implementation: returns the detector's confidence value per language.
*
* NOTE(review): the ordering of the returned map is defined by the library,
* presumably most likely language first — confirm before relying on firstKey().
*
* @param text the text to analyse
* @return map from language to confidence value, as computed by the wrapped detector
*/
public SortedMap<Language, Double> detectWithConfidence(String text) {
final SortedMap<Language, Double> detectedLanguages = languageDetector.computeLanguageConfidenceValues(text);
return detectedLanguages;
}
}
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
package de.gwdg.metadataqa.api.util;

import com.google.common.base.Optional;
import com.optimaize.langdetect.i18n.LdLocale;
import com.github.pemistahl.lingua.api.Language;
import java.io.IOException;
import java.util.SortedMap;

import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
Expand Down Expand Up @@ -38,24 +39,25 @@ public void tearDown() {
// Checks that a freshly constructed QALanguageDetector recognises English,
// German and Hungarian sample sentences, and exercises a digits-only input.
//
// NOTE(review): this listing is a diff view without +/- markers. Statements
// using Optional<LdLocale>, isPresent() and get().getLanguage() appear to be
// the previous assertions; those using Language and getIsoCode639_1() appear
// to be their replacements.
@Test
public void constructionTest() throws IOException {
QALanguageDetector languageDetector = new QALanguageDetector();
// English sample.
Optional<LdLocale> langs = languageDetector.detect("There are no plans to deprecate this class in the foreseeable future.");
Language langs = languageDetector.detect("There are no plans to deprecate this class in the foreseeable future.");
assertNotNull(langs);
assertTrue(langs.isPresent());
assertEquals("en", langs.get().getLanguage());
// The test expects the ISO 639-1 code's toString() to equal the lower-case code.
assertEquals("en", langs.getIsoCode639_1().toString());

// German sample.
langs = languageDetector.detect("Der Literaturnobelpreis 2016 ging an den Musiker Bob Dylan. Mit dieser Entscheidung erkannte die Jury zum ersten Mal die literarische Qualität von Songtexten an. Nicht jeder fand das gut.");
assertNotNull(langs);
assertTrue(langs.isPresent());
assertEquals("de", langs.get().getLanguage());
assertEquals("de", langs.getIsoCode639_1().toString());

// Hungarian sample (a single short sentence).
langs = languageDetector.detect("Ég a napmelegtől a kopár szík sarja.");
assertNotNull(langs);
assertTrue(langs.isPresent());
assertEquals("hu", langs.get().getLanguage());
assertEquals("hu", langs.getIsoCode639_1().toString());

// Digits-only input: the previous version asserted that nothing was detected.
langs = languageDetector.detect("1984.");
assertNotNull(langs);
assertFalse(langs.isPresent());
// NOTE(review): the replacement assertion is commented out, so with the new
// API only non-nullness is checked for this input. Presumably the library has
// a dedicated "unknown" Language value to compare against — confirm and
// restore an assertion here.
//assertFalse(langs);

// Confidence variant: the first key of the returned map is expected to be German.
SortedMap<Language, Double> confidences = languageDetector.detectWithConfidence("Der Literaturnobelpreis 2016 ging an den Musiker Bob Dylan. Mit dieser Entscheidung erkannte die Jury zum ersten Mal die literarische Qualität von Songtexten an. Nicht jeder fand das gut.");
assertNotNull(confidences);
assertEquals("de", confidences.firstKey().getIsoCode639_1().toString());

}
}

0 comments on commit 284573d

Please sign in to comment.