From a307d4d377d2bf9b6c34e8dee0fb26e91afe42de Mon Sep 17 00:00:00 2001 From: Evan Dempsey Date: Thu, 1 Aug 2013 18:42:17 +0100 Subject: [PATCH] code dump --- .gitignore | 71 ++ .../bin/docsum/summarizer/stoplist.txt | 678 ++++++++++++++++++ .../src/docsum/algorithm/HITSAlgorithm.java | 216 ++++++ .../src/docsum/algorithm/HITSNode.java | 67 ++ .../src/docsum/algorithm/IndexValuePair.java | 18 + .../docsum/algorithm/KeywordAlgorithm.java | 19 + .../src/docsum/algorithm/MeadAlgorithm.java | 354 +++++++++ .../algorithm/SummarizationAlgorithm.java | 20 + .../docsum/summarizer/DocumentSummarizer.java | 79 ++ .../docsum/summarizer/KeywordExtractor.java | 73 ++ .../summarizer/SentencePreprocessor.java | 153 ++++ .../docsum/summarizer/SentenceSegmenter.java | 95 +++ .../src/docsum/summarizer/stoplist.txt | 678 ++++++++++++++++++ .../src/docsum/ui/GraphicalInterface.java | 557 ++++++++++++++ 14 files changed, 3078 insertions(+) create mode 100644 .gitignore create mode 100644 DocumentSummarizer/bin/docsum/summarizer/stoplist.txt create mode 100644 DocumentSummarizer/src/docsum/algorithm/HITSAlgorithm.java create mode 100644 DocumentSummarizer/src/docsum/algorithm/HITSNode.java create mode 100644 DocumentSummarizer/src/docsum/algorithm/IndexValuePair.java create mode 100644 DocumentSummarizer/src/docsum/algorithm/KeywordAlgorithm.java create mode 100644 DocumentSummarizer/src/docsum/algorithm/MeadAlgorithm.java create mode 100644 DocumentSummarizer/src/docsum/algorithm/SummarizationAlgorithm.java create mode 100644 DocumentSummarizer/src/docsum/summarizer/DocumentSummarizer.java create mode 100644 DocumentSummarizer/src/docsum/summarizer/KeywordExtractor.java create mode 100644 DocumentSummarizer/src/docsum/summarizer/SentencePreprocessor.java create mode 100644 DocumentSummarizer/src/docsum/summarizer/SentenceSegmenter.java create mode 100644 DocumentSummarizer/src/docsum/summarizer/stoplist.txt create mode 100644 DocumentSummarizer/src/docsum/ui/GraphicalInterface.java diff 
--git a/.gitignore b/.gitignore new file mode 100644 index 0000000..395f6ee --- /dev/null +++ b/.gitignore @@ -0,0 +1,71 @@ +# Directories # +/build/ +/bin/ +target/ + +# OS Files # +.DS_Store + +*.class + +# Package Files # +*.jar +*.war +*.ear +*.db + +###################### +# Windows +###################### + +# Windows image file caches +Thumbs.db + +# Folder config file +Desktop.ini + +###################### +# OSX +###################### + +.DS_Store +.svn + +# Thumbnails +._* + +# Files that might appear on external disk +.Spotlight-V100 +.Trashes + + +###################### +# Eclipse +###################### + +*.pydevproject +.project +.metadata +bin/** +tmp/** +tmp/**/* +*.tmp +*.bak +*.swp +*~.nib +local.properties +.classpath +.settings/ +.loadpath +/src/main/resources/rebel.xml +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# CDT-specific +.cproject + +# PDT-specific +.buildpath diff --git a/DocumentSummarizer/bin/docsum/summarizer/stoplist.txt b/DocumentSummarizer/bin/docsum/summarizer/stoplist.txt new file mode 100644 index 0000000..ad46a69 --- /dev/null +++ b/DocumentSummarizer/bin/docsum/summarizer/stoplist.txt @@ -0,0 +1,678 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +-lrb- +-rrb- +'d +'ll +'re +'s +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +able +about +above +abroad +according +accordingly +across +actually +adj +after +afterwards +again +against +ago +ahead +ain't +all +allow +allows +almost +alone +along +alongside +already +also +although +always +am +amid +amidst +among +amongst +an +and +another +any +anybody +anyhow +anyone +anything +anyway +anyways +anywhere +apart +appear +appreciate +appropriate +are +aren't +around +as +a's +aside +ask +asking +associated +at +available +away +awfully +back +backward +backwards +be +became +because +become +becomes +becoming +been +before +beforehand +begin +behind +being +believe +below +beside +besides 
+best +better +between +beyond +both +brief +but +by +came +can +cannot +cant +can't +caption +cause +causes +certain +certainly +changes +clearly +c'mon +co +co. +com +come +comes +concerning +consequently +consider +considering +contain +containing +contains +corresponding +could +couldn't +course +c's +currently +dare +daren't +definitely +described +despite +did +didn't +different +directly +do +does +doesn't +doing +done +don't +down +downwards +during +each +edu +eg +eight +eighty +either +else +elsewhere +end +ending +enough +entirely +especially +et +etc +even +ever +evermore +every +everybody +everyone +everything +everywhere +ex +exactly +example +except +fairly +far +farther +few +fewer +fifth +first +five +followed +following +follows +for +forever +former +formerly +forth +forward +found +four +from +further +furthermore +get +gets +getting +given +gives +go +goes +going +gone +got +gotten +greetings +had +hadn't +half +happens +hardly +has +hasn't +have +haven't +having +he +he'd +he'll +hello +help +hence +her +here +hereafter +hereby +herein +here's +hereupon +hers +herself +he's +hi +him +himself +his +hither +hopefully +how +howbeit +however +hundred +i'd +ie +if +ignored +i'll +i'm +immediate +in +inasmuch +inc +inc. 
+indeed +indicate +indicated +indicates +inner +inside +insofar +instead +into +inward +is +isn't +it +it'd +it'll +its +it's +itself +i've +just +k +keep +keeps +kept +know +known +knows +last +lately +later +latter +latterly +least +less +lest +let +let's +like +liked +likely +likewise +little +look +looking +looks +low +lower +ltd +made +mainly +make +makes +many +may +maybe +mayn't +me +mean +meantime +meanwhile +merely +might +mightn't +mine +minus +miss +more +moreover +most +mostly +mr +mrs +much +must +mustn't +my +myself +n't +name +namely +nd +near +nearly +necessary +need +needn't +needs +neither +never +neverf +neverless +nevertheless +new +next +nine +ninety +no +nobody +non +none +nonetheless +noone +no-one +nor +normally +not +nothing +notwithstanding +novel +now +nowhere +obviously +of +off +often +oh +ok +okay +old +on +once +one +ones +one's +only +onto +opposite +or +other +others +otherwise +ought +oughtn't +our +ours +ourselves +out +outside +over +overall +own +particular +particularly +past +per +perhaps +placed +please +plus +possible +presumably +probably +provided +provides +que +quite +qv +rather +rd +re +really +reasonably +recent +recently +regarding +regardless +regards +relatively +respectively +right +round +said +same +saw +say +saying +says +second +secondly +see +seeing +seem +seemed +seeming +seems +seen +self +selves +sensible +sent +serious +seriously +seven +several +shall +shan't +she +she'd +she'll +she's +should +shouldn't +since +six +so +some +somebody +someday +somehow +someone +something +sometime +sometimes +somewhat +somewhere +soon +sorry +specified +specify +specifying +still +sub +such +sup +sure +take +taken +taking +tell +tends +th +than +thank +thanks +thanx +that +that'll +thats +that's +that've +the +their +theirs +them +themselves +then +thence +there +thereafter +thereby +there'd +therefore +therein +there'll +there're +theres +there's +thereupon +there've +these +they +they'd +they'll +they're +they've 
+thing +things +think +third +thirty +this +thorough +thoroughly +those +though +three +through +throughout +thru +thus +till +to +together +too +took +toward +towards +tried +tries +truly +try +trying +t's +twice +two +un +under +underneath +undoing +unfortunately +unless +unlike +unlikely +until +unto +up +upon +upwards +us +use +used +useful +uses +using +usually +v +value +various +versus +very +via +viz +vs +want +wants +was +wasn't +way +we +we'd +welcome +well +we'll +went +were +we're +weren't +we've +what +whatever +what'll +what's +what've +when +whence +whenever +where +whereafter +whereas +whereby +wherein +where's +whereupon +wherever +whether +which +whichever +while +whilst +whither +who +who'd +whoever +whole +who'll +whom +whomever +who's +whose +why +will +willing +wish +with +within +without +wonder +won't +would +wouldn't +yes +yet +you +you'd +you'll +your +you're +yours +yourself +yourselves +you've +zero \ No newline at end of file diff --git a/DocumentSummarizer/src/docsum/algorithm/HITSAlgorithm.java b/DocumentSummarizer/src/docsum/algorithm/HITSAlgorithm.java new file mode 100644 index 0000000..de1ddea --- /dev/null +++ b/DocumentSummarizer/src/docsum/algorithm/HITSAlgorithm.java @@ -0,0 +1,216 @@ +package docsum.algorithm; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Keyword extraction based on the paper: + * "Graph-Based Keyword Extraction for Single-Document Summarization" + * http://www.aclweb.org/anthology-new/W/W08/W08-1404.pdf + *

+ * HITS algorithm implementation based on pseudo-code at: + * http://en.wikipedia.org/wiki/HITS_algorithm + * + * @author Evan Dempsey + */ +public class HITSAlgorithm implements KeywordAlgorithm { + + + /** + * Default no-argument constructor. + */ + public HITSAlgorithm() { + + } + + // Generates ordered list of keywords. + public List getKeywords(List> sentences) { + + List wordList = makeWordList(sentences); + Map graph = makeGraph(sentences, wordList); + List orderedNodes = runHITS(graph, 10); + List keywords = makeKeywordList(orderedNodes, wordList); + + return keywords; + } + + /** + * Makes alphabetically ordered list all words in all sentences. + * + * @param sentences List of sentences, each of which is a list. + * @return Alphabetical list of words in sentences. + */ + private List makeWordList(List> sentences) { + List words = new ArrayList(); + + for (List sentence : sentences) { + for (String word : sentence) { + if (!words.contains(word)) + words.add(word); + } + } + + // Sort alphabetically. + Collections.sort(words); + + return words; + } + + /** + * Builds directed word coocurrence graph. There is an + * edge from word A to word B if word B directly follows A in a + * sentence. Each node has a list of incoming and outgoing edges. + * + * @param sentences List of lists of word strings. + * @param wordList Alphabetical list of all words in sentences. + * @return Graph with words as nodes and cooccurrences as edges. + */ + private Map makeGraph(List> sentences, + List wordList) { + + Map graph = new HashMap(); + + // Every word in the word list is represented + // by a node in the graph. Create nodes now + // to avoid problems with nodes that have no + // incoming or outgoing edges. + for (int i=0; i sentence : sentences) { + for (int i=0; i runHITS(Map graph, int k) { + + int numNodes = graph.size(); + + // Arrays for hub and authority scores. 
+ double[] authorityScores = new double[numNodes]; + double[] hubScores = new double[numNodes]; + + // All scores are initially 1. + Arrays.fill(authorityScores, 1.0); + Arrays.fill(hubScores, 1.0); + + // Run authority update step and hub update step + // sequentially for k iterations. + for (int i=0; i scorePairs = new ArrayList(); + for (int i=0; i sorted = new ArrayList(); + for (int i=0; i makeKeywordList(List ordered, List words) { + List keywords = new ArrayList(); + + for (Integer index : ordered) { + keywords.add(words.get(index)); + } + + return keywords; + } +} diff --git a/DocumentSummarizer/src/docsum/algorithm/HITSNode.java b/DocumentSummarizer/src/docsum/algorithm/HITSNode.java new file mode 100644 index 0000000..a46cec7 --- /dev/null +++ b/DocumentSummarizer/src/docsum/algorithm/HITSNode.java @@ -0,0 +1,67 @@ +package docsum.algorithm; + +import java.util.ArrayList; +import java.util.List; + +/** + * Graph node with list of + * incoming and outgoing edges. + * + * @author Evan Dempsey + */ +public class HITSNode { + + List incoming; + List outgoing; + + /** + * Default no-argument constructor that + * initializes incoming and outgoing ArrayLists. + */ + public HITSNode() { + incoming = new ArrayList(); + outgoing = new ArrayList(); + } + + /** + * Adds an incoming edge to the node if + * there is not already an edge from that node. + * + * @param value Incoming node index. + */ + public void addIncoming(int value) { + if (!incoming.contains(value)) { + incoming.add(value); + } + } + + /** + * Adds an outgoing edge to the node if + * there is not already an edge to that node. + * + * @param value Outgoing node index. + */ + public void addOutgoing(int value) { + if (!outgoing.contains(value)) { + outgoing.add(value); + } + } + + /** + * Gets list of incoming edges. + * + * @return List of incoming edges. + */ + public List getIncoming() { + return incoming; + } + + /** + * Gets list of outgoing edges. + * + * @return List of outgoing edges. 
+ */ + public List getOutgoing() { + return outgoing; + } +} diff --git a/DocumentSummarizer/src/docsum/algorithm/IndexValuePair.java b/DocumentSummarizer/src/docsum/algorithm/IndexValuePair.java new file mode 100644 index 0000000..a3a77c6 --- /dev/null +++ b/DocumentSummarizer/src/docsum/algorithm/IndexValuePair.java @@ -0,0 +1,18 @@ +package docsum.algorithm; + +/** + * Convenience class for sorting arrays by + * a floating point value associated with each element. + * + * @author Evan Dempsey + */ +public class IndexValuePair implements Comparable{ + + double value; + int index; + + // Comparison takes place on value. + public int compareTo(IndexValuePair c) { + return Double.compare(this.value, c.value); + } +} \ No newline at end of file diff --git a/DocumentSummarizer/src/docsum/algorithm/KeywordAlgorithm.java b/DocumentSummarizer/src/docsum/algorithm/KeywordAlgorithm.java new file mode 100644 index 0000000..4876b1b --- /dev/null +++ b/DocumentSummarizer/src/docsum/algorithm/KeywordAlgorithm.java @@ -0,0 +1,19 @@ +package docsum.algorithm; + +import java.util.List; + +/** + * Interface for keyword extraction algorithms. + * + * @author Evan Dempsey + */ +public interface KeywordAlgorithm { + + /** + * Runs keyword algorithm on tokenized sentence list. + * + * @param sentences List of lists of strings. + * @return List of keyword strings. 
+ */ + public List getKeywords(List> sentences); +} diff --git a/DocumentSummarizer/src/docsum/algorithm/MeadAlgorithm.java b/DocumentSummarizer/src/docsum/algorithm/MeadAlgorithm.java new file mode 100644 index 0000000..860af15 --- /dev/null +++ b/DocumentSummarizer/src/docsum/algorithm/MeadAlgorithm.java @@ -0,0 +1,354 @@ +package docsum.algorithm; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Performs extractive summarization of a document + * by algorithmically selecting a list of sentences + * to include in the summary. + *

+ * Implementation of MEAD algorithm based on paper: + * "Centroid-based summarization of multiple documents" + * http://clair.si.umich.edu/~radev/papers/centroid.pdf + * + * @author Evan Dempsey + */ +public class MeadAlgorithm implements SummarizationAlgorithm { + + Map> docFrequencies; + List terms; + Map averageTermFrequencies; + + /** + * No-argument constructor. + */ + public MeadAlgorithm() { + + } + + /** + * Initializes all data structures. + */ + private void initModel() { + docFrequencies = new HashMap>(); + terms = new ArrayList(); + averageTermFrequencies = new HashMap(); + } + + // Gets selection of sentences to include in summary. + public List getSelection(List> sentences, int percentage) { + + initModel(); + buildModel(sentences); + + List centroidValues = makeCentroidValues(sentences.size()); + List centroidDoc = makeCentroidDocument(centroidValues); + List docCentroidValues = makeDocumentCentroids(sentences, centroidValues, centroidDoc); + double maxCentroidValue = Collections.max(docCentroidValues); + List positionalValues = makePositionalValues(sentences.size(), maxCentroidValue); + List> sentenceVectors = makeSentenceVectors(sentences, terms); + List overlaps = makeFirstSentenceOverlaps(sentenceVectors); + List sentenceScores = makeSentenceScores(docCentroidValues, positionalValues, overlaps); + List summarySelection = makeSummarySelection(sentenceScores, percentage); + + return summarySelection; + } + + /** + * Reads the sentences and extract document frequencies, + * unique terms, and average term frequencies. + * + * @param sentences List of tokenized sentences. + */ + private void buildModel(List> sentences) { + makeDocFrequencies(sentences); + makeTerms(); + makeAverageTermFrequencies(sentences); + } + + + /** + * Slices off a percentage of top ranking sentences. + * + * @param sentenceScores List of IndexValuePair (sentence and score). + * @param percent Percentage of sentences to slice off. 
+ * @return List of indices of sentences to include in summary. + */ + private List makeSummarySelection(List sentenceScores, + int percent) { + + // Calculate the number of sentences in the summary. + int summaryLength = (int) (sentenceScores.size() * percent / (double) 100.0); + + // Make sure the summary is at least 1 sentence long. + if (summaryLength < 1) + summaryLength = 1; + + // Sort the sentence scores from top to bottom. + Collections.sort(sentenceScores); + Collections.reverse(sentenceScores); + + // Take the top scoring sentence indices. + List indices = new ArrayList(); + for (int i=0; i makeSentenceScores(List docCentroidValues, + List positionalValues, List overlaps) { + + List pairs = new ArrayList(); + + for (int i=0; i> makeSentenceVectors( + List> sentences, List sentenceTerms) { + // Sentence vectors are vectors of length N where N is the number of + // different words in the document and the value at the index + // is the number of times that word occurs in the sentence. + + List> sentenceVectors = new ArrayList>(); + + for (List document : sentences) { + List sentenceVector = new ArrayList(); + + for (String term : sentenceTerms) { + sentenceVector.add(Collections.frequency(document, term)); + } + + sentenceVectors.add(sentenceVector); + } + + return sentenceVectors; + } + + /** + * Calculates dot products of all sentence vectors and the + * first sentence in the collection of sentence vectors. + * + * @param sentenceVectors List of sentence vectors. + * @return List of first sentence overlap values. 
+ */ + private List makeFirstSentenceOverlaps(List> sentenceVectors) { + List overlaps = new ArrayList(); + List firstSentence = sentenceVectors.get(0); + + for (int i=0; i vector = sentenceVectors.get(i); + + for (int j=0; j makePositionalValues(int size, double maxCentroidValue) { + List posValues = new ArrayList(); + + for (int i=0; i makeDocumentCentroids(List> sentences, + List centroidValues, + List centroidDoc) { + + List docCentroidValues = new ArrayList(); + + for (List document : sentences) { + double total = 0.0; + + for (String term : centroidDoc) { + if (document.contains(term)) { + total += centroidValues.get(terms.indexOf(term)); + } + } + + docCentroidValues.add(total); + } + + return docCentroidValues; + } + + /** + * Determines the frequency of each word in the document. + * + * @param sentences List of tokenized sentences. + */ + private void makeDocFrequencies(List> sentences) { + + for (int i=0; i docsWithTerm = new ArrayList(); + docsWithTerm.add(i); + docFrequencies.put(word, docsWithTerm); + } + else { + if (!docFrequencies.get(word).contains(i)) { + docFrequencies.get(word).add(i); + } + } + } + } + } + + /** + * Creates alphabetized list of words in document. + */ + private void makeTerms() { + Set keys = docFrequencies.keySet(); + + for (String key : keys) { + terms.add(key); + } + + Collections.sort(terms); + } + + /** + * Computes average term frequency for each word in document. + * + * @param sentences List of tokenized sentences. + */ + private void makeAverageTermFrequencies(List> sentences) { + // Average term frequency = total occurrences in collection / documents in collection + + // Count the total occurrences of each word. + for (List document : sentences) { + for (String term : document) { + if (averageTermFrequencies.containsKey(term)) { + averageTermFrequencies.put(term, averageTermFrequencies.get(term)+1.0); + } + else { + averageTermFrequencies.put(term, 1.0); + } + } + } + + // Average the occurrences over the documents. 
+ int numDocs = sentences.size(); + for (String term : averageTermFrequencies.keySet()) { + averageTermFrequencies.put(term, averageTermFrequencies.get(term) / (float) numDocs); + } + } + + /** + * Calculates the centroid value for each unique word. + * + * @param numSentences Number of sentences in document. + * @return List of centroid values for each word. + */ + private List makeCentroidValues(int numSentences) { + List centroidValues = new ArrayList(); + + for(String term : terms) { + double tf = averageTermFrequencies.get(term); + int df = docFrequencies.get(term).size(); + + centroidValues.add(tf * Math.log10(numSentences / (double) df)); + } + + return centroidValues; + } + + /** + * Builds centroid document by taking words with + * a centroid value above a certain threshold. + * + * @param centroidValues Centroid values of words. + * @return Centroid document as list of strings. + */ + private List makeCentroidDocument(List centroidValues) { + + // Put centroid values into pairs. + ArrayList pairs = new ArrayList(); + + for (int i=0; i 0) + topTerms = 1; + + List centroidDoc = new ArrayList(); + for (int i=0; i getSelection(List> sentences, int percentage); +} diff --git a/DocumentSummarizer/src/docsum/summarizer/DocumentSummarizer.java b/DocumentSummarizer/src/docsum/summarizer/DocumentSummarizer.java new file mode 100644 index 0000000..842ab1b --- /dev/null +++ b/DocumentSummarizer/src/docsum/summarizer/DocumentSummarizer.java @@ -0,0 +1,79 @@ +package docsum.summarizer; + +import java.util.List; + +import docsum.algorithm.MeadAlgorithm; + + +/** + * Preprocesses text, feeds it into the summarization algorithm, + * and constructs summary text from returned sentence indices. + * + * @author Evan Dempsey + */ +public class DocumentSummarizer { + + SentenceSegmenter segmenter; + SentencePreprocessor preprocessor; + MeadAlgorithm mead; + + /** + * Constructor for DocumentSummarizar class. + * + * @param segmenter SentenceSegmenter instance. 
+ * @param preprocessor SentencePreprocessor instance. + */ + public DocumentSummarizer(SentenceSegmenter segmenter, + SentencePreprocessor preprocessor) { + this.segmenter = segmenter; + this.preprocessor = preprocessor; + mead = new MeadAlgorithm(); + } + + /** + * Generates a summary of the input text of the required length. + * + * @param text Text string to summarize. + * @param percentage Percentage of sentences to include in summary. + * @return Summary string. + */ + public String summarize(String text, int percentage) { + + // Only run the summarization algorithm if there + // is text in the source JTextArea. + if (text.length() > 0) { + List> sentences = segmenter.segment(text); + List> preprocessed = preprocessor.process(sentences); + List selection = mead.getSelection(preprocessed, percentage); + List original = segmenter.getOriginalSentences(text); + String summary = buildSummaryString(original, selection); + + return summary; + } + + return ""; + } + + /** + * Puts the summary together using the original + * sentences and the indices of the sentences selected by + * the summarization algorithm. + * + * @param sentences List of sentence strings. + * @param selection List of sentence indices in summary. + * @return Summary string. + */ + private String buildSummaryString(List sentences, + List selection) { + + StringBuilder stringBuilder = new StringBuilder(selection.size()); + + for (int i=0; i> sentences = segmenter.segment(text); + List> processed = preprocessor.process(sentences); + List keywords = hits.getKeywords(processed); + + return makeKeywordString(keywords, 20); + } + + /** + * Joins top k extracted keywords together + * into a comma-separated string. + * + * @param keywords List of keywords. + * @param k Number of keywords to take. + * @return Comma-separated string of keywords. + */ + private String makeKeywordString(List keywords, int k) { + + // If fewer than k keywords are available, + // take as many as possible. 
+ int max = keywords.size(); + if (k < max) + max = k; + + String joined = ""; + + for (int i=0; i stopwords; + + /** + * Default no-argument constructor. + * Reads the stop word list. + */ + public SentencePreprocessor() { + stopwords = readStopwords(); + } + + /** + * Performs standard text preprocessing tasks on tokenized sentences. + * Stop word removal, case normalization, punctuation removal. + * + * @param document List of list of strings representing document. + * @return List of list of lower-case strings with stop words and punctuation removed. + */ + public List> process(List> document) { + return removeStopwords(removePunctuation(makeLowercase(document))); + } + + /** + * Removes words that appear in the stop word list. + * + * @param document List of lists of words in sentences. + * @return Sentence list with stopwords removed. + */ + public List> removeStopwords(List> document) { + + List> processed = new ArrayList>(); + + for (int i=0; i oldSentence = document.get(i); + List newSentence = new ArrayList(); + + for (int j=0; j> removePunctuation(List> document) { + + // Each sentence is a list of Strings and + // the document is a list of sentences. + List> processed = new ArrayList>(); + + // Make a regex pattern to match strings with letters or numbers. + Pattern notPuncPattern = Pattern.compile("[A-Za-z0-9]+"); + + for (List sentence: document) { + List newTokens = new ArrayList(); + + // If a string has letters or numbers, it is not just + // punctuation, so add it to the list. + for (String word : sentence) { + Matcher matcher = notPuncPattern.matcher(word); + + if (matcher.find()) { + newTokens.add(word); + } + } + + processed.add(newTokens); + } + + return processed; + } + + /** + * Makes all words in all sentences lower-case. + * + * @param document List of lists of words in sentences. + * @return List of sentences with all words lower-case. 
+ */ + public List> makeLowercase(List> document) { + + List> processed = new ArrayList>(); + + for (int i=0; i oldSentence = document.get(i); + List newSentence = new ArrayList(); + + for (int j=0; j readStopwords() { + + String stopword = null; + List stopwords = new ArrayList(); + + String fileName = "stoplist.txt"; + InputStream inputStream = getClass().getResourceAsStream(fileName); + + try { + InputStreamReader inputStreamReader = new InputStreamReader(inputStream); + BufferedReader bufferedReader = new BufferedReader(inputStreamReader); + + while ((stopword = bufferedReader.readLine()) != null) { + stopwords.add(stopword); + } + } catch (IOException e) { + e.printStackTrace(); + } + + return stopwords; + } +} diff --git a/DocumentSummarizer/src/docsum/summarizer/SentenceSegmenter.java b/DocumentSummarizer/src/docsum/summarizer/SentenceSegmenter.java new file mode 100644 index 0000000..0c1323b --- /dev/null +++ b/DocumentSummarizer/src/docsum/summarizer/SentenceSegmenter.java @@ -0,0 +1,95 @@ +package docsum.summarizer; + +import java.io.Reader; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; + +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.HasWord; +import edu.stanford.nlp.objectbank.TokenizerFactory; +import edu.stanford.nlp.process.CoreLabelTokenFactory; +import edu.stanford.nlp.process.DocumentPreprocessor; +import edu.stanford.nlp.process.PTBTokenizer; + +/** + * Splits text into list of tokenized sentences using + * DocumentPreprocessor class from Stanford CoreNLP library. + * + * @author Evan Dempsey + */ +public class SentenceSegmenter { + + /** + * No-argument constructor. + */ + public SentenceSegmenter() { + + } + + /** + * Splits text into list of tokenized sentences. + * + * @param text Text string. + * @return List of lists of strings representing sentences. 
+ */ + public List> segment(String text) { + + List> sentences = new ArrayList>(); + Reader reader = new StringReader(text); + DocumentPreprocessor preprocessor = new DocumentPreprocessor(reader); + + for (List sentence : preprocessor) { + List tokens = new ArrayList(); + + for (HasWord token : sentence) { + tokens.add(token.word()); + } + + sentences.add(tokens); + } + + return sentences; + } + + /** + * Split texts string into list of untokenized sentences. + * + * @param text Test string. + * @return List of sentence strings. + */ + public List getOriginalSentences (String text) { + + List sentenceList = new ArrayList(); + + Reader reader = new StringReader(text); + DocumentPreprocessor preprocessor = new DocumentPreprocessor(reader); + String tokenizerOptions = "invertible=true"; + TokenizerFactory tf = PTBTokenizer.factory( + new CoreLabelTokenFactory(), + tokenizerOptions); + preprocessor.setTokenizerFactory(tf); + + // Reconstruct original sentence strings. + for (List sentence : preprocessor) { + String sentenceString = ""; + boolean printSpace = true; + + for (HasWord token : sentence) { + CoreLabel cl = (CoreLabel) token; + if (!printSpace) { + sentenceString += cl.get(CoreAnnotations.BeforeAnnotation.class); + printSpace = true; + } + + sentenceString += cl.get(CoreAnnotations.OriginalTextAnnotation.class); + sentenceString += cl.get(CoreAnnotations.AfterAnnotation.class); + } + + sentenceList.add(sentenceString); + } + + return sentenceList; + } +} diff --git a/DocumentSummarizer/src/docsum/summarizer/stoplist.txt b/DocumentSummarizer/src/docsum/summarizer/stoplist.txt new file mode 100644 index 0000000..ad46a69 --- /dev/null +++ b/DocumentSummarizer/src/docsum/summarizer/stoplist.txt @@ -0,0 +1,678 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +-lrb- +-rrb- +'d +'ll +'re +'s +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +able +about +above +abroad +according +accordingly +across +actually +adj +after +afterwards +again +against +ago 
+ahead +ain't +all +allow +allows +almost +alone +along +alongside +already +also +although +always +am +amid +amidst +among +amongst +an +and +another +any +anybody +anyhow +anyone +anything +anyway +anyways +anywhere +apart +appear +appreciate +appropriate +are +aren't +around +as +a's +aside +ask +asking +associated +at +available +away +awfully +back +backward +backwards +be +became +because +become +becomes +becoming +been +before +beforehand +begin +behind +being +believe +below +beside +besides +best +better +between +beyond +both +brief +but +by +came +can +cannot +cant +can't +caption +cause +causes +certain +certainly +changes +clearly +c'mon +co +co. +com +come +comes +concerning +consequently +consider +considering +contain +containing +contains +corresponding +could +couldn't +course +c's +currently +dare +daren't +definitely +described +despite +did +didn't +different +directly +do +does +doesn't +doing +done +don't +down +downwards +during +each +edu +eg +eight +eighty +either +else +elsewhere +end +ending +enough +entirely +especially +et +etc +even +ever +evermore +every +everybody +everyone +everything +everywhere +ex +exactly +example +except +fairly +far +farther +few +fewer +fifth +first +five +followed +following +follows +for +forever +former +formerly +forth +forward +found +four +from +further +furthermore +get +gets +getting +given +gives +go +goes +going +gone +got +gotten +greetings +had +hadn't +half +happens +hardly +has +hasn't +have +haven't +having +he +he'd +he'll +hello +help +hence +her +here +hereafter +hereby +herein +here's +hereupon +hers +herself +he's +hi +him +himself +his +hither +hopefully +how +howbeit +however +hundred +i'd +ie +if +ignored +i'll +i'm +immediate +in +inasmuch +inc +inc. 
+indeed +indicate +indicated +indicates +inner +inside +insofar +instead +into +inward +is +isn't +it +it'd +it'll +its +it's +itself +i've +just +k +keep +keeps +kept +know +known +knows +last +lately +later +latter +latterly +least +less +lest +let +let's +like +liked +likely +likewise +little +look +looking +looks +low +lower +ltd +made +mainly +make +makes +many +may +maybe +mayn't +me +mean +meantime +meanwhile +merely +might +mightn't +mine +minus +miss +more +moreover +most +mostly +mr +mrs +much +must +mustn't +my +myself +n't +name +namely +nd +near +nearly +necessary +need +needn't +needs +neither +never +neverf +neverless +nevertheless +new +next +nine +ninety +no +nobody +non +none +nonetheless +noone +no-one +nor +normally +not +nothing +notwithstanding +novel +now +nowhere +obviously +of +off +often +oh +ok +okay +old +on +once +one +ones +one's +only +onto +opposite +or +other +others +otherwise +ought +oughtn't +our +ours +ourselves +out +outside +over +overall +own +particular +particularly +past +per +perhaps +placed +please +plus +possible +presumably +probably +provided +provides +que +quite +qv +rather +rd +re +really +reasonably +recent +recently +regarding +regardless +regards +relatively +respectively +right +round +said +same +saw +say +saying +says +second +secondly +see +seeing +seem +seemed +seeming +seems +seen +self +selves +sensible +sent +serious +seriously +seven +several +shall +shan't +she +she'd +she'll +she's +should +shouldn't +since +six +so +some +somebody +someday +somehow +someone +something +sometime +sometimes +somewhat +somewhere +soon +sorry +specified +specify +specifying +still +sub +such +sup +sure +take +taken +taking +tell +tends +th +than +thank +thanks +thanx +that +that'll +thats +that's +that've +the +their +theirs +them +themselves +then +thence +there +thereafter +thereby +there'd +therefore +therein +there'll +there're +theres +there's +thereupon +there've +these +they +they'd +they'll +they're +they've 
+thing +things +think +third +thirty +this +thorough +thoroughly +those +though +three +through +throughout +thru +thus +till +to +together +too +took +toward +towards +tried +tries +truly +try +trying +t's +twice +two +un +under +underneath +undoing +unfortunately +unless +unlike +unlikely +until +unto +up +upon +upwards +us +use +used +useful +uses +using +usually +v +value +various +versus +very +via +viz +vs +want +wants +was +wasn't +way +we +we'd +welcome +well +we'll +went +were +we're +weren't +we've +what +whatever +what'll +what's +what've +when +whence +whenever +where +whereafter +whereas +whereby +wherein +where's +whereupon +wherever +whether +which +whichever +while +whilst +whither +who +who'd +whoever +whole +who'll +whom +whomever +who's +whose +why +will +willing +wish +with +within +without +wonder +won't +would +wouldn't +yes +yet +you +you'd +you'll +your +you're +yours +yourself +yourselves +you've +zero \ No newline at end of file diff --git a/DocumentSummarizer/src/docsum/ui/GraphicalInterface.java b/DocumentSummarizer/src/docsum/ui/GraphicalInterface.java new file mode 100644 index 0000000..e44ae46 --- /dev/null +++ b/DocumentSummarizer/src/docsum/ui/GraphicalInterface.java @@ -0,0 +1,557 @@ +package docsum.ui; + +import javax.swing.BorderFactory; +import javax.swing.Box; +import javax.swing.BoxLayout; +import javax.swing.JButton; +import javax.swing.JFileChooser; +import javax.swing.JFrame; +import javax.swing.JLabel; +import javax.swing.JMenu; +import javax.swing.JMenuBar; +import javax.swing.JMenuItem; +import javax.swing.JOptionPane; +import javax.swing.JPanel; +import javax.swing.JScrollPane; +import javax.swing.JSlider; +import javax.swing.JTextArea; +import javax.swing.SwingUtilities; +import javax.swing.event.ChangeEvent; +import javax.swing.event.ChangeListener; +import javax.swing.event.DocumentEvent; +import javax.swing.event.DocumentListener; +import javax.swing.filechooser.FileNameExtensionFilter; +import 
javax.swing.text.DefaultEditorKit;
import javax.swing.text.Document;

import docsum.summarizer.DocumentSummarizer;
import docsum.summarizer.KeywordExtractor;
import docsum.summarizer.SentencePreprocessor;
import docsum.summarizer.SentenceSegmenter;

import java.awt.BorderLayout;
import java.awt.Component;
import java.awt.Dimension;
import java.awt.GridLayout;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Graphical user interface for summarizer program.
 *
 * <p>The window shows the source text on the left and the extracted
 * keywords and generated summary on the right. A slider at the bottom
 * selects the summary length as a percentage, and character/word/line
 * statistics under the source and summary areas are refreshed whenever
 * the corresponding text changes.
 *
 * @author Evan Dempsey
 */
public class GraphicalInterface extends JFrame {

    private static final long serialVersionUID = 6253527329314698074L;

    /** Document property key used to tell the two observed documents apart. */
    private static final String NAME_PROPERTY = "name";

    /** Value of {@link #NAME_PROPERTY} on the source text area's document. */
    private static final String SOURCE_NAME = "source";

    /** Value of {@link #NAME_PROPERTY} on the summary text area's document. */
    private static final String SUMMARY_NAME = "summary";

    /** Fixed height, in pixels, of the keyword scroll pane. */
    private static final int KEYWORD_PANE_HEIGHT = 70;

    DocumentSummarizer summarizer;
    KeywordExtractor extractor;

    JPanel panel;
    JTextArea sourceTextArea;
    JTextArea keywordTextArea;
    JTextArea summaryTextArea;
    JLabel percentLabel;
    JSlider percentSlider;
    JLabel sourceCharsLabel;
    JLabel sourceWordsLabel;
    JLabel sourceLinesLabel;
    JLabel summaryCharsLabel;
    JLabel summaryWordsLabel;
    JLabel summaryLinesLabel;

    /**
     * Constructor.
     *
     * @param summarizer DocumentSummarizer instance.
     * @param extractor KeywordExtractor instance.
     */
    public GraphicalInterface(DocumentSummarizer summarizer,
            KeywordExtractor extractor) {
        this.summarizer = summarizer;
        this.extractor = extractor;
        initUI();
    }

    /**
     * Sets up the user interface: menu bar, the three text areas with
     * their statistics labels, the summary-length slider and the
     * summarize button, and finally the frame properties.
     */
    public void initUI() {

        // Text areas, each wrapped in an always-scrollable pane.
        sourceTextArea = createWrappingTextArea();
        keywordTextArea = createWrappingTextArea();
        summaryTextArea = createWrappingTextArea();

        JScrollPane sourcePane = createScrollPane(sourceTextArea);
        JScrollPane keywordPane = createScrollPane(keywordTextArea);
        JScrollPane summaryPane = createScrollPane(summaryTextArea);

        // Pin the keyword pane's height so the BoxLayout gives all the
        // remaining vertical space to the summary pane.
        Dimension keywordSize = new Dimension(
                keywordPane.getMaximumSize().width, KEYWORD_PANE_HEIGHT);
        keywordPane.setPreferredSize(keywordSize);
        keywordPane.setMinimumSize(keywordSize);
        keywordPane.setMaximumSize(keywordSize);

        // Tag each observed document so the listener can tell which
        // set of statistics labels it has to refresh.
        Document sourceDocument = sourceTextArea.getDocument();
        sourceDocument.putProperty(NAME_PROPERTY, SOURCE_NAME);
        sourceDocument.addDocumentListener(new TextChangeListener());

        Document summaryDocument = summaryTextArea.getDocument();
        summaryDocument.putProperty(NAME_PROPERTY, SUMMARY_NAME);
        summaryDocument.addDocumentListener(new TextChangeListener());

        // Labels for document statistics.
        sourceCharsLabel = createLabel("Characters: ");
        sourceWordsLabel = createLabel("Words: ");
        sourceLinesLabel = createLabel("Lines: ");
        summaryCharsLabel = createLabel("Characters: ");
        summaryWordsLabel = createLabel("Words: ");
        summaryLinesLabel = createLabel("Lines: ");

        // Left column: source text and its statistics.
        JPanel leftPanel = new JPanel();
        leftPanel.setLayout(new BoxLayout(leftPanel, BoxLayout.Y_AXIS));
        leftPanel.setBorder(BorderFactory.createEmptyBorder(0, 0, 0, 10));
        leftPanel.add(createLabel("Source"));
        leftPanel.add(createGap(5));
        leftPanel.add(sourcePane);
        leftPanel.add(createGap(10));
        leftPanel.add(sourceCharsLabel);
        leftPanel.add(createGap(5));
        leftPanel.add(sourceWordsLabel);
        leftPanel.add(createGap(5));
        leftPanel.add(sourceLinesLabel);

        // Right column: keywords, summary and the summary statistics.
        JPanel rightPanel = new JPanel();
        rightPanel.setLayout(new BoxLayout(rightPanel, BoxLayout.Y_AXIS));
        rightPanel.setBorder(BorderFactory.createEmptyBorder(0, 10, 0, 0));
        rightPanel.add(createLabel("Keywords"));
        rightPanel.add(createGap(5));
        rightPanel.add(keywordPane);
        rightPanel.add(createGap(10));
        rightPanel.add(createLabel("Summary"));
        rightPanel.add(createGap(5));
        rightPanel.add(summaryPane);
        rightPanel.add(createGap(10));
        rightPanel.add(summaryCharsLabel);
        rightPanel.add(createGap(5));
        rightPanel.add(summaryWordsLabel);
        rightPanel.add(createGap(5));
        rightPanel.add(summaryLinesLabel);

        // The two columns share the width equally; GridLayout places
        // children in insertion order and takes no constraints.
        JPanel centerPanel = new JPanel();
        centerPanel.setLayout(new GridLayout(1, 2));
        centerPanel.add(leftPanel);
        centerPanel.add(rightPanel);

        // Summary-length slider and the label mirroring its value.
        percentSlider = new JSlider();
        percentSlider.setBorder(BorderFactory.createTitledBorder("Summary Length"));
        percentSlider.setMajorTickSpacing(20);
        percentSlider.setMinorTickSpacing(5);
        percentSlider.setPaintTicks(true);
        percentSlider.setValue(50);
        percentSlider.addChangeListener(new SliderChangeListener());

        percentLabel = new JLabel("50%");

        JButton summarizeButton = new JButton("Summarize");
        summarizeButton.setToolTipText("Summarize the document.");
        summarizeButton.addActionListener(new SummarizeActionListener());

        // Bottom row: slider and percentage on the left, button on the right.
        JPanel bottomPanel = new JPanel();
        bottomPanel.setLayout(new BoxLayout(bottomPanel, BoxLayout.X_AXIS));
        bottomPanel.setBorder(BorderFactory.createEmptyBorder(20, 0, 0, 0));
        bottomPanel.add(percentSlider);
        bottomPanel.add(percentLabel);
        bottomPanel.add(Box.createHorizontalGlue());
        bottomPanel.add(summarizeButton);

        // Main content panel.
        panel = new JPanel();
        panel.setLayout(new BorderLayout());
        panel.setBorder(BorderFactory.createEmptyBorder(20, 20, 20, 20));
        panel.add(centerPanel, BorderLayout.CENTER);
        panel.add(bottomPanel, BorderLayout.SOUTH);

        setJMenuBar(createMenuBar());
        add(panel);
        setTitle("Document Summarizer");
        setSize(800, 600);
        setLocationRelativeTo(null);
        setDefaultCloseOperation(EXIT_ON_CLOSE);
    }

    /**
     * Builds the menu bar with the File, Edit and Help menus.
     *
     * @return the fully populated menu bar.
     */
    private JMenuBar createMenuBar() {
        JMenu fileMenu = new JMenu("File");
        fileMenu.add(createMenuItem("Open", "Open a text document.",
                new OpenActionListener()));
        fileMenu.add(createMenuItem("Save", "Save the summary.",
                new SaveActionListener()));
        fileMenu.add(createMenuItem("Exit", "Exit application.",
                new QuitActionListener()));

        // The editor-kit actions operate on the focused text component;
        // only their display text and tool tip are overridden here.
        JMenu editMenu = new JMenu("Edit");
        editMenu.add(describe(new JMenuItem(new DefaultEditorKit.CutAction()),
                "Cut", "Cut the current selection."));
        editMenu.add(describe(new JMenuItem(new DefaultEditorKit.CopyAction()),
                "Copy", "Copy the current selection."));
        editMenu.add(describe(new JMenuItem(new DefaultEditorKit.PasteAction()),
                "Paste", "Paste the contents of the clipboard."));

        JMenu helpMenu = new JMenu("Help");
        helpMenu.add(createMenuItem("About", "About the document summarizer.",
                new AboutActionListener()));

        JMenuBar menuBar = new JMenuBar();
        menuBar.add(fileMenu);
        menuBar.add(editMenu);
        menuBar.add(helpMenu);
        return menuBar;
    }

    /**
     * Creates a menu item with the given text, tool tip and listener.
     *
     * @param text Text shown in the menu.
     * @param toolTip Tool tip text.
     * @param listener Listener invoked when the item is selected.
     * @return the new menu item.
     */
    private static JMenuItem createMenuItem(String text, String toolTip,
            ActionListener listener) {
        JMenuItem item = describe(new JMenuItem(), text, toolTip);
        item.addActionListener(listener);
        return item;
    }

    /**
     * Sets the display text and tool tip of a menu item.
     *
     * @param item Menu item to configure.
     * @param text Text shown in the menu.
     * @param toolTip Tool tip text.
     * @return the same menu item, for call chaining.
     */
    private static JMenuItem describe(JMenuItem item, String text,
            String toolTip) {
        item.setText(text);
        item.setToolTipText(toolTip);
        return item;
    }

    /**
     * Creates a word-wrapping text area with an 8 pixel inner margin.
     *
     * @return the new text area.
     */
    private static JTextArea createWrappingTextArea() {
        JTextArea textArea = new JTextArea();
        textArea.setLineWrap(true);
        textArea.setWrapStyleWord(true);
        textArea.setBorder(BorderFactory.createEmptyBorder(8, 8, 8, 8));
        return textArea;
    }

    /**
     * Wraps a text area in a left-aligned scroll pane whose vertical
     * scroll bar is always visible.
     *
     * @param textArea Text area to display in the scroll pane.
     * @return the new scroll pane.
     */
    private static JScrollPane createScrollPane(JTextArea textArea) {
        JScrollPane scrollPane = new JScrollPane();
        scrollPane.setAlignmentX(Component.LEFT_ALIGNMENT);
        scrollPane.setVerticalScrollBarPolicy(
                JScrollPane.VERTICAL_SCROLLBAR_ALWAYS);
        scrollPane.setViewportView(textArea);
        return scrollPane;
    }

    /**
     * Creates a left-aligned label.
     *
     * @param text Initial label text.
     * @return the new label.
     */
    private static JLabel createLabel(String text) {
        JLabel label = new JLabel(text);
        label.setAlignmentX(Component.LEFT_ALIGNMENT);
        return label;
    }

    /**
     * Creates an invisible vertical spacer.
     *
     * @param height Height of the spacer in pixels.
     * @return the spacer component.
     */
    private static Component createGap(int height) {
        return Box.createRigidArea(new Dimension(0, height));
    }

    /**
     * Reads a specified file and returns its contents as a string.
     * Every line, including the last one, is terminated with the
     * platform line separator. The file is decoded with the platform
     * default charset.
     *
     * @param file File object.
     * @return String with file contents, or {@code null} if the file
     *         could not be opened or read.
     */
    public String readFile(File file) {
        String lineSeparator = System.getProperty("line.separator");
        StringBuilder contents = new StringBuilder();

        try {
            BufferedReader reader = new BufferedReader(new FileReader(file));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    contents.append(line).append(lineSeparator);
                }
            } finally {
                // Release the file handle even when a read fails part-way.
                reader.close();
            }
        } catch (IOException e) {
            return null;
        }

        return contents.toString();
    }

    /**
     * Computes simple statistics for a piece of text.
     *
     * <p>Every character is counted, a line is counted for each
     * {@code '\n'} character, and a word is a maximal run of characters
     * for which {@link Character#isLetter(char)} is true (so digits and
     * punctuation, including apostrophes, separate words).
     *
     * @param text Text to analyse; must not be {@code null}.
     * @return a three-element array: character count, word count and
     *         line count, in that order.
     */
    static int[] countStats(String text) {
        int words = 0;
        int lines = 0;
        boolean insideWord = false;

        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (c == '\n') {
                lines++;
            }
            boolean letter = Character.isLetter(c);
            if (letter && !insideWord) {
                // First letter after a non-letter starts a new word.
                words++;
            }
            insideWord = letter;
        }

        return new int[] {text.length(), words, lines};
    }

    /**
     * Change listener: updates percentage figure
     * in the percentage label in response to slider changes.
     *
     * @author Evan Dempsey
     */
    public class SliderChangeListener implements ChangeListener {

        @Override
        public void stateChanged(ChangeEvent e) {
            JSlider source = (JSlider) e.getSource();
            percentLabel.setText(Integer.toString(source.getValue()) + "%");
        }
    }

    // Action listeners: executed in response to
    // graphical user interface events.

    /**
     * Takes text from the sourceTextArea and the percentage from the
     * percentSlider and generates a summary, then extracts a keyword
     * list from that summary. Puts the summary into the summaryTextArea
     * and the keyword list into the keywordTextArea.
     *
     * @author Evan Dempsey
     */
    public class SummarizeActionListener implements ActionListener {

        @Override
        public void actionPerformed(ActionEvent e) {
            int percentage = percentSlider.getValue();
            String summary = summarizer.summarize(
                    sourceTextArea.getText(), percentage);
            String keywords = extractor.extract(summary);

            summaryTextArea.setText(summary);
            keywordTextArea.setText(keywords);
        }
    }

    /**
     * Listens for the Open menu item,
     * gets a file from the file chooser dialog,
     * reads its contents,
     * and puts it in the sourceTextArea. If the file cannot be read an
     * error dialog is shown and the current source text is kept.
     *
     * @author Evan Dempsey
     */
    public class OpenActionListener implements ActionListener {

        @Override
        public void actionPerformed(ActionEvent e) {
            JFileChooser fileOpen = new JFileChooser();
            fileOpen.setFileFilter(
                    new FileNameExtensionFilter("Text Files", "txt"));

            int returnValue = fileOpen.showDialog(panel, "Open File");
            if (returnValue != JFileChooser.APPROVE_OPTION) {
                return;
            }

            File file = fileOpen.getSelectedFile();
            String text = readFile(file);
            if (text == null) {
                // readFile signals failure with null; do not wipe the
                // existing source text in that case.
                JOptionPane.showMessageDialog(panel,
                        "Could not read " + file.getPath() + ".",
                        "Open Failed",
                        JOptionPane.ERROR_MESSAGE);
                return;
            }

            sourceTextArea.setText(text);
        }
    }

    /**
     * Listens for the Save menu item and saves the summary to the file
     * picked in the file chooser dialog. An error dialog is shown if
     * the file cannot be written.
     *
     * @author Evan Dempsey
     */
    public class SaveActionListener implements ActionListener {

        @Override
        public void actionPerformed(ActionEvent e) {
            JFileChooser fileSave = new JFileChooser();

            int returnValue = fileSave.showDialog(panel, "Save Summary");
            if (returnValue != JFileChooser.APPROVE_OPTION) {
                return;
            }

            File file = fileSave.getSelectedFile();
            try {
                FileWriter fileWriter = new FileWriter(file);
                try {
                    fileWriter.write(summaryTextArea.getText());
                } finally {
                    // close() also flushes, so a failure here is reported
                    // through the enclosing catch as a failed save.
                    fileWriter.close();
                }
            } catch (IOException ex) {
                JOptionPane.showMessageDialog(panel,
                        "Could not save the summary to " + file.getPath()
                                + ":\n" + ex.getMessage(),
                        "Save Failed",
                        JOptionPane.ERROR_MESSAGE);
            }
        }
    }

    /**
     * Listens for the Exit menu item and exits the application.
     *
     * @author Evan Dempsey
     */
    public class QuitActionListener implements ActionListener {

        @Override
        public void actionPerformed(ActionEvent e) {
            System.exit(0);
        }
    }

    /**
     * Listens for the About menu item and displays dialog.
     *
     * @author Evan Dempsey
     */
    public class AboutActionListener implements ActionListener {

        @Override
        public void actionPerformed(ActionEvent e) {

            // JOptionPane breaks a String message at '\n', so escaped
            // newlines are enough to separate the paragraphs.
            String infoString = "Document Summarizer\n\n"
                    + "Automatic document summarization program by Evan Dempsey.\n\n"
                    + "Penn Treebank tokenizer provided by Stanford NLP toolkit.";

            // Create JOptionPane with no icon and custom title.
            JOptionPane.showMessageDialog(panel,
                    infoString,
                    "About Document Summarizer",
                    JOptionPane.PLAIN_MESSAGE);
        }
    }

    /**
     * Listens for changes in documents, recalculates
     * document statistics and updates stats widgets.
     *
     * @author Evan Dempsey
     */
    public class TextChangeListener implements DocumentListener {

        @Override
        public void changedUpdate(DocumentEvent e) {
            updateStats(e);
        }

        @Override
        public void insertUpdate(DocumentEvent e) {
            updateStats(e);
        }

        @Override
        public void removeUpdate(DocumentEvent e) {
            updateStats(e);
        }

        /**
         * Recomputes document statistics and updates widgets. The
         * document's "name" property decides whether the source or the
         * summary labels are refreshed; anything other than "source"
         * is treated as the summary.
         *
         * @param e DocumentEvent.
         */
        private void updateStats(DocumentEvent e) {
            Document doc = e.getDocument();
            boolean isSource = SOURCE_NAME.equals(doc.getProperty(NAME_PROPERTY));

            String text = isSource
                    ? sourceTextArea.getText()
                    : summaryTextArea.getText();
            int[] stats = countStats(text);

            JLabel charsLabel = isSource ? sourceCharsLabel : summaryCharsLabel;
            JLabel wordsLabel = isSource ? sourceWordsLabel : summaryWordsLabel;
            JLabel linesLabel = isSource ? sourceLinesLabel : summaryLinesLabel;

            charsLabel.setText("Characters: " + stats[0]);
            wordsLabel.setText("Words: " + stats[1]);
            linesLabel.setText("Lines: " + stats[2]);
        }
    }

    /**
     * Main entry point for application. Builds the summarizer and
     * keyword extractor and shows the main window on the Swing event
     * dispatch thread.
     *
     * @param args Command line arguments.
     */
    public static void main(String[] args) {
        SwingUtilities.invokeLater(new Runnable() {
            @Override
            public void run() {

                // The summarizer and keyword extractor share
                // an instance of the preprocessor and segmenter.
                SentenceSegmenter seg = new SentenceSegmenter();
                SentencePreprocessor prep = new SentencePreprocessor();
                DocumentSummarizer docsum = new DocumentSummarizer(seg, prep);
                KeywordExtractor keyext = new KeywordExtractor(seg, prep);

                GraphicalInterface mainWindow =
                        new GraphicalInterface(docsum, keyext);
                mainWindow.setVisible(true);
            }
        });
    }
}