diff --git a/demo1.py b/demo1.py index 466e87c..80f291f 100644 --- a/demo1.py +++ b/demo1.py @@ -16,7 +16,7 @@ def demo1(): - Assumes 'pubs_nips' exists and that pdf text is present. This can be obtained by running nips_download_parse.py and then nips_add_pdftext.py, or by downloading it - from site. See README.txt + from site (https://sites.google.com/site/researchpooler/home) Side-effects: - will use os call to open a pdf with default program diff --git a/demo2.py b/demo2.py index 997c154..3ef5b16 100644 --- a/demo2.py +++ b/demo2.py @@ -13,7 +13,8 @@ def demo2(): Pre-requisites: - Assumes 'pubs_nips' exists. This can be obtained by running - nips_download_parse.py or by downloading it from site. See README.txt + nips_download_parse.py or by downloading it from site. + (https://sites.google.com/site/researchpooler/home) Side-effects: - will use os call to open a pdf with default program diff --git a/demo3.py b/demo3.py index 65bc941..a6279ab 100644 --- a/demo3.py +++ b/demo3.py @@ -18,7 +18,8 @@ def demo3(): - Assumes 'pubs_nips' exists and contains pdf text inside (under key 'pdf_text'). This can be obtained by running nips_download_parse.py and then nips_add_pdftext.py - or by downloading it from site. See README.txt + or by downloading it from site. + (https://sites.google.com/site/researchpooler/home) Side-effects: - will use os call to open a pdf with default program diff --git a/google_search.py b/google_search.py index 6488a7e..53fb962 100644 --- a/google_search.py +++ b/google_search.py @@ -1,5 +1,5 @@ """ -Functions for searching Google and retrieving urls +Functions for searching Google and retrieving urls to PDFs """ import urllib @@ -8,7 +8,8 @@ def getPDFURL(pdf_title): """ Search google for exact match of the title of this paper - and return the url to the pdf file. + and return the url to the pdf file, or 'notfound' if no exact match was + found. pdf_title: string, name of the paper. Returns url to the PDF, or 'notfound' if unsuccessful diff --git a/pdf_read.py b/pdf_read.py index 30c0d1b..61f7a50 100644 --- a/pdf_read.py +++ b/pdf_read.py @@ -11,7 +11,7 @@ def convertPDF(pdf_path, codec='ascii'): """ - Takes path to a PDF and returns the text inside it + Takes path to a PDF and returns the text inside it as string pdf_path: string indicating path to a .pdf file. Can also be a URL starting with 'http' diff --git a/repool_analysis.py b/repool_analysis.py index 7b6c48e..bb51ed2 100644 --- a/repool_analysis.py +++ b/repool_analysis.py @@ -9,11 +9,11 @@ def publicationSimilarityNaive(train_pubs, test_pub): using a very simple overlap method. train_pubs: list of publications - test_pub: a publication dictionary. Must have key 'pdf_text' with the + test_pub: a publication to compare to. Must contain 'pdf_text' key with the bag of words that occur in that publication returns list of (scores, one for each of the train_pubs. Returns -1 for - any score where a publication does not have the pdf_text available + any score where a publication does not have the pdf_text available. """ if not test_pub.has_key('pdf_text'): diff --git a/repool_util.py b/repool_util.py index fa46c77..b7d51d1 100644 --- a/repool_util.py +++ b/repool_util.py @@ -1,12 +1,12 @@ """ Functions: useful general utils """ -import pickle +import cPickle import re from os import startfile def savePubs(filename, pubs_to_save): """ - backup a list of publications + save a list of publications into a file using Python's pickle filename: string pubs_to_save: List of Publication objects @@ -14,7 +14,7 @@ def savePubs(filename, pubs_to_save): """ file = open(filename, 'w') - pickle.dump(pubs_to_save, file) + cPickle.dump(pubs_to_save, file) file.close() def loadPubs(filename): @@ -25,7 +25,7 @@ def loadPubs(filename): """ unpicklefile = open(filename, 'r') - pubs = pickle.load(unpicklefile) + pubs = cPickle.load(unpicklefile) unpicklefile.close() return pubs @@ -44,6 +44,7 @@ def openPDFs(pdf_lst): def stringToWordDictionary(str): """ Takes a string and returns dictionary that stores frequency of every word. + Some stop words are removed. str: string returns dictionary of word counts for each word. Example: d['hello'] -> 5