Added Stanford NER
anikethjr committed May 30, 2018
1 parent 172fd11 commit b3d1821
Showing 36 changed files with 166,693 additions and 0 deletions.
9,999 changes: 9,999 additions & 0 deletions data/dev.jsonl

Large diffs are not rendered by default.

9,999 changes: 9,999 additions & 0 deletions data/test.jsonl

Large diffs are not rendered by default.

145,449 changes: 145,449 additions & 0 deletions data/train.jsonl

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions doc_retrieval.py
@@ -0,0 +1,19 @@
import os
import jsonlines

def getRelevantDocs(claim):
    docs = []
    print(claim)
    # Write the claim to a temporary file so the Stanford NER command-line tool can read it.
    with open("temp.txt", "w+") as temp:
        temp.write(claim)
    # Run the 3-class CRF classifier; tabbedEntities output is redirected to temp.tsv.
    cmd = "java -mx600m -cp ner/stanford-ner/stanford-ner.jar:ner/stanford-ner/lib/* edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier ner/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz -outputFormat tabbedEntities -textFile temp.txt > temp.tsv"
    os.system(cmd)
    # In tabbedEntities output the first column is the entity text and the second its class;
    # rows with a single column carry no entity.
    entities = []
    with open("temp.tsv", "r") as temp:
        for line in temp:
            fields = line.strip().split("\t")
            if len(fields) > 1:
                entities.append(fields[0])
    print(entities)
    # Document retrieval from the extracted entities is not implemented yet, so docs stays empty.
    return docs
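
A quick way to exercise this stub is to feed it claims from the JSON Lines files added above. A minimal sketch, assuming each line of data/dev.jsonl is a record with a "claim" field (that field name is an assumption; the data diffs are not rendered here):

import jsonlines
from doc_retrieval import getRelevantDocs

# Run the entity extractor over the first few dev claims.
# Assumes each JSON line carries a "claim" field (not verified above).
with jsonlines.open("data/dev.jsonl") as reader:
    for i, example in enumerate(reader):
        getRelevantDocs(example["claim"])  # prints the claim and its named entities
        if i == 4:
            break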
Binary file added doc_retrieval.pyc
Binary file not shown.
339 changes: 339 additions & 0 deletions ner/stanford-ner/LICENSE.txt

Large diffs are not rendered by default.

171 changes: 171 additions & 0 deletions ner/stanford-ner/NERDemo.java
@@ -0,0 +1,171 @@
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.*;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
import edu.stanford.nlp.util.Triple;

import java.util.List;


/** This is a demo of calling CRFClassifier programmatically.
* <p>
* Usage: {@code java -mx400m -cp "*" NERDemo [serializedClassifier [fileName]] }
* <p>
* If arguments aren't specified, they default to
* classifiers/english.all.3class.distsim.crf.ser.gz and some hardcoded sample text.
* If run with arguments, it shows some of the ways to get k-best labelings and
* probabilities out with CRFClassifier. If run without arguments, it shows some of
* the alternative output formats that you can get.
* <p>
* To use CRFClassifier from the command line:
* </p><blockquote>
* {@code java -mx400m edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier [classifier] -textFile [file] }
* </blockquote><p>
* Or if the file is already tokenized and one word per line, perhaps in
* a tab-separated value format with extra columns for part-of-speech tag,
* etc., use the version below (note the 's' instead of the 'x'):
* </p><blockquote>
* {@code java -mx400m edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier [classifier] -testFile [file] }
* </blockquote>
*
* @author Jenny Finkel
* @author Christopher Manning
*/

public class NERDemo {

  public static void main(String[] args) throws Exception {

    String serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz";

    if (args.length > 0) {
      serializedClassifier = args[0];
    }

    AbstractSequenceClassifier<CoreLabel> classifier = CRFClassifier.getClassifier(serializedClassifier);

    /* For either a file to annotate or for the hardcoded text example, this
       demo file shows several ways to process the input, for teaching purposes.
    */

    if (args.length > 1) {

      /* For the file, it shows (1) how to run NER on a String, (2) how
         to get the entities in the String with character offsets, and
         (3) how to run NER on a whole file (without loading it into a String).
      */

      String fileContents = IOUtils.slurpFile(args[1]);
      List<List<CoreLabel>> out = classifier.classify(fileContents);
      for (List<CoreLabel> sentence : out) {
        for (CoreLabel word : sentence) {
          System.out.print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
        }
        System.out.println();
      }

      System.out.println("---");
      out = classifier.classifyFile(args[1]);
      for (List<CoreLabel> sentence : out) {
        for (CoreLabel word : sentence) {
          System.out.print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
        }
        System.out.println();
      }

      System.out.println("---");
      List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(fileContents);
      for (Triple<String, Integer, Integer> item : list) {
        System.out.println(item.first() + ": " + fileContents.substring(item.second(), item.third()));
      }
      System.out.println("---");
      System.out.println("Ten best entity labelings");
      DocumentReaderAndWriter<CoreLabel> readerAndWriter = classifier.makePlainTextReaderAndWriter();
      classifier.classifyAndWriteAnswersKBest(args[1], 10, readerAndWriter);

      System.out.println("---");
      System.out.println("Per-token marginalized probabilities");
      classifier.printProbs(args[1], readerAndWriter);

      // -- This code prints out the first order (token pair) clique probabilities.
      // -- But that output is a bit overwhelming, so we leave it commented out by default.
      // System.out.println("---");
      // System.out.println("First Order Clique Probabilities");
      // ((CRFClassifier) classifier).printFirstOrderProbs(args[1], readerAndWriter);

    } else {

      /* For the hard-coded String, it shows how to run it on a single
         sentence, and how to do this and produce several formats, including
         slash tags and an inline XML output format. It also shows the full
         contents of the {@code CoreLabel}s that are constructed by the
         classifier. And it shows getting out the probabilities of different
         assignments and an n-best list of classifications with probabilities.
      */

      String[] example = {"Good afternoon Rajat Raina, how are you today?",
                          "I go to school at Stanford University, which is located in California." };
      for (String str : example) {
        System.out.println(classifier.classifyToString(str));
      }
      System.out.println("---");

      for (String str : example) {
        // This one puts in spaces and newlines between tokens, so just print not println.
        System.out.print(classifier.classifyToString(str, "slashTags", false));
      }
      System.out.println("---");

      for (String str : example) {
        // This one is best for dealing with the output as a TSV (tab-separated column) file.
        // The first column gives entities, the second their classes, and the third the remaining text in a document
        System.out.print(classifier.classifyToString(str, "tabbedEntities", false));
      }
      System.out.println("---");

      for (String str : example) {
        System.out.println(classifier.classifyWithInlineXML(str));
      }
      System.out.println("---");

      for (String str : example) {
        System.out.println(classifier.classifyToString(str, "xml", true));
      }
      System.out.println("---");

      for (String str : example) {
        System.out.print(classifier.classifyToString(str, "tsv", false));
      }
      System.out.println("---");

      // This gets out entities with character offsets
      int j = 0;
      for (String str : example) {
        j++;
        List<Triple<String,Integer,Integer>> triples = classifier.classifyToCharacterOffsets(str);
        for (Triple<String,Integer,Integer> trip : triples) {
          System.out.printf("%s over character offsets [%d, %d) in sentence %d.%n",
              trip.first(), trip.second(), trip.third(), j);
        }
      }
      System.out.println("---");

      // This prints out all the details of what is stored for each token
      int i = 0;
      for (String str : example) {
        for (List<CoreLabel> lcl : classifier.classify(str)) {
          for (CoreLabel cl : lcl) {
            System.out.print(i++ + ": ");
            System.out.println(cl.toShorterString());
          }
        }
      }

      System.out.println("---");

    }
  }

}
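
The tabbedEntities format used by both this demo and doc_retrieval.py is a three-column TSV: entity text, entity class, then the remaining non-entity text. A minimal parsing sketch that also keeps the entity class (the temp.tsv path follows doc_retrieval.py; the handling of rows without an entity is an assumption):

# Sketch: parse tabbedEntities output into (mention, NER class) pairs,
# so documents could later be filtered by type (PERSON, LOCATION, ORGANIZATION).
def read_tabbed_entities(path="temp.tsv"):
    entities = []
    with open(path, "r") as tsv:
        for row in tsv:
            fields = row.rstrip("\n").split("\t")
            if len(fields) > 1 and fields[0]:  # entity rows have a non-empty first column (assumed)
                entities.append((fields[0], fields[1]))
    return entities

if __name__ == "__main__":
    print(read_tabbed_entities())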
