Showing 36 changed files with 166,693 additions and 0 deletions.
@@ -0,0 +1,19 @@
import os
import jsonlines


def getRelevantDocs(claim):
    # Run Stanford NER over the claim text and collect the named entities it finds.
    docs = []
    print(claim)
    with open("temp.txt", "w+") as temp:
        temp.write(claim)
    cmd = "java -mx600m -cp ner/stanford-ner/stanford-ner.jar:ner/stanford-ner/lib/* edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier ner/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz -outputFormat tabbedEntities -textFile temp.txt > temp.tsv"
    os.system(cmd)
    entities = []
    with open("temp.tsv", "r") as temp:
        for line in temp:
            fields = line.strip().split("\t")
            # In tabbedEntities output, rows with more than one column carry an entity in the first column.
            if len(fields) > 1:
                entities.append(fields[0])
    print(entities)
    # Note: docs is never populated here; only the extracted entities are printed.
    return docs
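
For reference, a minimal usage sketch of the helper above; the claim string is purely illustrative, and the function currently returns an empty list because it only prints the extracted entities:

# Hypothetical driver, assuming the Stanford NER paths above are valid relative to the working directory.
if __name__ == "__main__":
    claim = "Stanford University is located in California."
    docs = getRelevantDocs(claim)  # prints the claim and its named entities
    print(docs)                    # currently always [] until document lookup is added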
@@ -0,0 +1,171 @@
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.*;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
import edu.stanford.nlp.util.Triple;

import java.util.List;


/** This is a demo of calling CRFClassifier programmatically.
 *  <p>
 *  Usage: {@code java -mx400m -cp "*" NERDemo [serializedClassifier [fileName]] }
 *  <p>
 *  If arguments aren't specified, they default to
 *  classifiers/english.all.3class.distsim.crf.ser.gz and some hardcoded sample text.
 *  If run with arguments, it shows some of the ways to get k-best labelings and
 *  probabilities out with CRFClassifier. If run without arguments, it shows some of
 *  the alternative output formats that you can get.
 *  <p>
 *  To use CRFClassifier from the command line:
 *  </p><blockquote>
 *  {@code java -mx400m edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier [classifier] -textFile [file] }
 *  </blockquote><p>
 *  Or if the file is already tokenized and one word per line, perhaps in
 *  a tab-separated value format with extra columns for part-of-speech tag,
 *  etc., use the version below (note the 's' instead of the 'x'):
 *  </p><blockquote>
 *  {@code java -mx400m edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier [classifier] -testFile [file] }
 *  </blockquote>
 *
 *  @author Jenny Finkel
 *  @author Christopher Manning
 */

public class NERDemo {

  public static void main(String[] args) throws Exception {

    String serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz";

    if (args.length > 0) {
      serializedClassifier = args[0];
    }

    AbstractSequenceClassifier<CoreLabel> classifier = CRFClassifier.getClassifier(serializedClassifier);

    /* For either a file to annotate or for the hardcoded text example, this
       demo file shows several ways to process the input, for teaching purposes.
    */

    if (args.length > 1) {

      /* For the file, it shows (1) how to run NER on a String, (2) how
         to get the entities in the String with character offsets, and
         (3) how to run NER on a whole file (without loading it into a String).
      */

      String fileContents = IOUtils.slurpFile(args[1]);
      List<List<CoreLabel>> out = classifier.classify(fileContents);
      for (List<CoreLabel> sentence : out) {
        for (CoreLabel word : sentence) {
          System.out.print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
        }
        System.out.println();
      }

      System.out.println("---");
      out = classifier.classifyFile(args[1]);
      for (List<CoreLabel> sentence : out) {
        for (CoreLabel word : sentence) {
          System.out.print(word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
        }
        System.out.println();
      }

      System.out.println("---");
      List<Triple<String, Integer, Integer>> list = classifier.classifyToCharacterOffsets(fileContents);
      for (Triple<String, Integer, Integer> item : list) {
        System.out.println(item.first() + ": " + fileContents.substring(item.second(), item.third()));
      }
      System.out.println("---");
      System.out.println("Ten best entity labelings");
      DocumentReaderAndWriter<CoreLabel> readerAndWriter = classifier.makePlainTextReaderAndWriter();
      classifier.classifyAndWriteAnswersKBest(args[1], 10, readerAndWriter);

      System.out.println("---");
      System.out.println("Per-token marginalized probabilities");
      classifier.printProbs(args[1], readerAndWriter);

      // -- This code prints out the first order (token pair) clique probabilities.
      // -- But that output is a bit overwhelming, so we leave it commented out by default.
      // System.out.println("---");
      // System.out.println("First Order Clique Probabilities");
      // ((CRFClassifier) classifier).printFirstOrderProbs(args[1], readerAndWriter);

    } else {

      /* For the hard-coded String, it shows how to run it on a single
         sentence, and how to do this and produce several formats, including
         slash tags and an inline XML output format. It also shows the full
         contents of the {@code CoreLabel}s that are constructed by the
         classifier. And it shows getting out the probabilities of different
         assignments and an n-best list of classifications with probabilities.
      */

      String[] example = {"Good afternoon Rajat Raina, how are you today?",
          "I go to school at Stanford University, which is located in California." };
      for (String str : example) {
        System.out.println(classifier.classifyToString(str));
      }
      System.out.println("---");

      for (String str : example) {
        // This one puts in spaces and newlines between tokens, so just print not println.
        System.out.print(classifier.classifyToString(str, "slashTags", false));
      }
      System.out.println("---");

      for (String str : example) {
        // This one is best for dealing with the output as a TSV (tab-separated column) file.
        // The first column gives entities, the second their classes, and the third the remaining text in a document.
        System.out.print(classifier.classifyToString(str, "tabbedEntities", false));
      }
      System.out.println("---");

      for (String str : example) {
        System.out.println(classifier.classifyWithInlineXML(str));
      }
      System.out.println("---");

      for (String str : example) {
        System.out.println(classifier.classifyToString(str, "xml", true));
      }
      System.out.println("---");

      for (String str : example) {
        System.out.print(classifier.classifyToString(str, "tsv", false));
      }
      System.out.println("---");

      // This gets out entities with character offsets
      int j = 0;
      for (String str : example) {
        j++;
        List<Triple<String, Integer, Integer>> triples = classifier.classifyToCharacterOffsets(str);
        for (Triple<String, Integer, Integer> trip : triples) {
          System.out.printf("%s over character offsets [%d, %d) in sentence %d.%n",
              trip.first(), trip.second(), trip.third(), j);
        }
      }
      System.out.println("---");

      // This prints out all the details of what is stored for each token
      int i = 0;
      for (String str : example) {
        for (List<CoreLabel> lcl : classifier.classify(str)) {
          for (CoreLabel cl : lcl) {
            System.out.print(i++ + ": ");
            System.out.println(cl.toShorterString());
          }
        }
      }

      System.out.println("---");

    }
  }

}
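
The demo above exercises several output formats, but the retrieval helper earlier in this commit only consumes the tabbedEntities format, which it produces by shelling out to the classifier. A minimal in-process sketch of the same call, assuming the classifier path used by that script; the class name ClaimEntityDemo and the claim string are hypothetical:

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;

public class ClaimEntityDemo {

  public static void main(String[] args) throws Exception {
    // Load the same 3-class English model the Python helper points at
    // (the relative path is an assumption about the working directory layout).
    String model = "ner/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz";
    AbstractSequenceClassifier<CoreLabel> classifier = CRFClassifier.getClassifier(model);

    String claim = "Stanford University is located in California.";
    // Same tab-separated entity output that the Python helper parses out of temp.tsv.
    System.out.print(classifier.classifyToString(claim, "tabbedEntities", false));
  }

}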