Added Stanford NER
anikethjr committed May 30, 2018
1 parent 172fd11 commit b3d1821
Showing 36 changed files with 166,693 additions and 0 deletions.
9,999 changes: 9,999 additions & 0 deletions data/dev.jsonl

Large diffs are not rendered by default.

9,999 changes: 9,999 additions & 0 deletions data/test.jsonl

Large diffs are not rendered by default.

145,449 changes: 145,449 additions & 0 deletions data/train.jsonl

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions doc_retrieval.py
@@ -0,0 +1,19 @@
import os
import jsonlines

def getRelevantDocs(claim):
    docs = []
    print(claim)
    # Write the claim to a temporary file so the Stanford NER command-line tool can read it.
    with open("temp.txt", "w+") as temp:
        temp.write(claim)
    # Run the 3-class CRF classifier; tabbedEntities output is redirected to temp.tsv.
    cmd = "java -mx600m -cp ner/stanford-ner/stanford-ner.jar:ner/stanford-ner/lib/* edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier ner/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz -outputFormat tabbedEntities -textFile temp.txt > temp.tsv"
    os.system(cmd)
    # In tabbedEntities output the first column is the entity text and the second its class;
    # rows with a single column carry no entity.
    entities = []
    with open("temp.tsv", "r") as temp:
        for line in temp:
            fields = line.strip().split("\t")
            if len(fields) > 1:
                entities.append(fields[0])
    print(entities)
    # Document retrieval from the extracted entities is not implemented yet, so docs stays empty.
    return docs
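
A quick way to exercise this stub is to feed it claims from the JSON Lines files added above. A minimal sketch, assuming each line of data/dev.jsonl is a record with a "claim" field (that field name is an assumption; the data diffs are not rendered here):

import jsonlines
from doc_retrieval import getRelevantDocs

# Run the entity extractor over the first few dev claims.
# Assumes each JSON line carries a "claim" field (not verified above).
with jsonlines.open("data/dev.jsonl") as reader:
    for i, example in enumerate(reader):
        getRelevantDocs(example["claim"])  # prints the claim and its named entities
        if i == 4:
            break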
Binary file added doc_retrieval.pyc
Binary file not shown.
339 changes: 339 additions & 0 deletions ner/stanford-ner/LICENSE.txt

Large diffs are not rendered by default.

171 changes: 171 additions & 0 deletions ner/stanford-ner/NERDemo.java
@@ -0,0 +1,171 @@
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.*;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
import edu.stanford.nlp.util.Triple;

import java.util.List;


/** This is a demo of calling CRFClassifier programmatically.
* <p>
* Usage: {@code java -mx400m -cp "*" NERDemo [serializedClassifier [fileName]] }
* <p>
* If arguments aren't specified, they default to
* classifiers/english.all.3class.distsim.crf.ser.gz and some hardcoded sample text.
* If run with arguments, it shows some of the ways to get k-best labelings and
* probabilities out with CRFClassifier. If run without arguments, it shows some of
* the alternative output formats that you can get.
* <p>
* To use CRFClassifier from the command line:
* </p><blockquote>
* {@code java -mx400m edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier [classifier] -textFile [file] }
* </blockquote><p>
* Or if the file is already tokenized and one word per line, perhaps in
* a tab-separated value format with extra columns for part-of-speech tag,
* etc., use the version below (note the 's' instead of the 'x'):
* </p><blockquote>
* {@code java -mx400m edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier [classifier] -testFile [file] }
* </blockquote>
*
* @author Jenny Finkel
* @author Christopher Manning
*/

public class NERDemo {

  public static void main(String[] args) throws Exception {

    String serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz";

    if (args.length > 0) {
      serializedClassifier = args[0];
    }

    AbstractSequenceClassifier<CoreLabel> classifier = CRFClassifier.getClassifier(serializedClassifier);

    /* For either a file to annotate or for the hardcoded text example, this
       demo file shows several ways to process the input, for teaching purposes.
    */

    if (args.length > 1) {

      /* For the file, it shows (1) how to run NER on a String, (2) how
         to get the entities in the String with character offsets, and
         (3) how to run NER on a whole file (without loading it into a String).
      */

      String fileContents = IOUtils.slurpFile(args[1]);
      List<List<CoreLabel>> out = classifier.classify(fileContents);
      for (List<CoreLabel> sentence : out) {
        for (CoreLabel word : sentence) {
          System.out.print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
        }
        System.out.println();
      }

      System.out.println("---");
      out = classifier.classifyFile(args[1]);
      for (List<CoreLabel> sentence : out) {
        for (CoreLabel word : sentence) {
          System.out.print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
        }
        System.out.println();
      }

      System.out.println("---");
      List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(fileContents);
      for (Triple<String, Integer, Integer> item : list) {
        System.out.println(item.first() + ": " + fileContents.substring(item.second(), item.third()));
      }
      System.out.println("---");
      System.out.println("Ten best entity labelings");
      DocumentReaderAndWriter<CoreLabel> readerAndWriter = classifier.makePlainTextReaderAndWriter();
      classifier.classifyAndWriteAnswersKBest(args[1], 10, readerAndWriter);

      System.out.println("---");
      System.out.println("Per-token marginalized probabilities");
      classifier.printProbs(args[1], readerAndWriter);

      // -- This code prints out the first order (token pair) clique probabilities.
      // -- But that output is a bit overwhelming, so we leave it commented out by default.
      // System.out.println("---");
      // System.out.println("First Order Clique Probabilities");
      // ((CRFClassifier) classifier).printFirstOrderProbs(args[1], readerAndWriter);

    } else {

      /* For the hard-coded String, it shows how to run it on a single
         sentence, and how to do this and produce several formats, including
         slash tags and an inline XML output format. It also shows the full
         contents of the {@code CoreLabel}s that are constructed by the
         classifier. And it shows getting out the probabilities of different
         assignments and an n-best list of classifications with probabilities.
      */

      String[] example = {"Good afternoon Rajat Raina, how are you today?",
                          "I go to school at Stanford University, which is located in California." };
      for (String str : example) {
        System.out.println(classifier.classifyToString(str));
      }
      System.out.println("---");

      for (String str : example) {
        // This one puts in spaces and newlines between tokens, so just print not println.
        System.out.print(classifier.classifyToString(str, "slashTags", false));
      }
      System.out.println("---");

      for (String str : example) {
        // This one is best for dealing with the output as a TSV (tab-separated column) file.
        // The first column gives entities, the second their classes, and the third the remaining text in a document
        System.out.print(classifier.classifyToString(str, "tabbedEntities", false));
      }
      System.out.println("---");

      for (String str : example) {
        System.out.println(classifier.classifyWithInlineXML(str));
      }
      System.out.println("---");

      for (String str : example) {
        System.out.println(classifier.classifyToString(str, "xml", true));
      }
      System.out.println("---");

      for (String str : example) {
        System.out.print(classifier.classifyToString(str, "tsv", false));
      }
      System.out.println("---");

      // This gets out entities with character offsets
      int j = 0;
      for (String str : example) {
        j++;
        List<Triple<String,Integer,Integer>> triples = classifier.classifyToCharacterOffsets(str);
        for (Triple<String,Integer,Integer> trip : triples) {
          System.out.printf("%s over character offsets [%d, %d) in sentence %d.%n",
              trip.first(), trip.second(), trip.third(), j);
        }
      }
      System.out.println("---");

      // This prints out all the details of what is stored for each token
      int i = 0;
      for (String str : example) {
        for (List<CoreLabel> lcl : classifier.classify(str)) {
          for (CoreLabel cl : lcl) {
            System.out.print(i++ + ": ");
            System.out.println(cl.toShorterString());
          }
        }
      }

      System.out.println("---");

    }
  }

}
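
The tabbedEntities format used by both this demo and doc_retrieval.py is a three-column TSV: entity text, entity class, then the remaining non-entity text. A minimal parsing sketch that also keeps the entity class (the temp.tsv path follows doc_retrieval.py; the handling of rows without an entity is an assumption):

# Sketch: parse tabbedEntities output into (mention, NER class) pairs,
# so documents could later be filtered by type (PERSON, LOCATION, ORGANIZATION).
def read_tabbed_entities(path="temp.tsv"):
    entities = []
    with open(path, "r") as tsv:
        for row in tsv:
            fields = row.rstrip("\n").split("\t")
            if len(fields) > 1 and fields[0]:  # entity rows have a non-empty first column (assumed)
                entities.append((fields[0], fields[1]))
    return entities

if __name__ == "__main__":
    print(read_tabbed_entities())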
