diff --git a/README b/README
new file mode 100644
index 0000000..97b14fa
--- /dev/null
+++ b/README
@@ -0,0 +1,91 @@
+REsearch POOLer (repool)
+v0.1
+Authors: Andrej Karpathy, http://cs.stanford.edu/~karpathy/
+
+-------------------------------------------------------------------------------
+MOTIVATION AND PLAN
+-------------------------------------------------------------------------------
+- Ever wish you could instantly view all published papers that match a keyword in the title or abstract?
+- Or, ever wish you could look up the paper most similar (content-wise) to some paper at some random url?
+- How about searching for all papers that report a result on a particular dataset?
+-> Literature review is much harder than it should be.
+
+This set of tools is an initiative to fix this problem. Here's the master plan and the types of scripts in this project:
+
+STAGE 1 scripts: scripts for raw data gathering and parsing that output pickles in an intermediate dictionary-based representation. These will mostly be scripts that download files, parse HTML, etc.
+STAGE 2 scripts: scripts that enrich the intermediate representations from STAGE 1 in various ways. For example, a script could iterate over publications in the database and, if it finds that some entry is missing its pdf contents, attempt a Google search to find the pdf and add it if successful.
+STAGE 3 scripts: tools and helper functions that analyze the intermediate representations and provide higher-level functionality, such as 'find all documents that are similar to this one' or 'find object detection papers in psychology'. All kinds of fun Machine Learning can go here as well, like LDA etc.
+STAGE 4 files: a nice web-based UI (maybe using Google App Engine?) to make the project accessible and easy to use. These will be modules that interact with STAGE 3 scripts on the backend.
+
+-------------------------------------------------------------------------------
+INSTALLATION
+-------------------------------------------------------------------------------
+Since the project is young, simply download the scripts and browse around. Unless you want to run the scripts from scratch to generate the NIPS database, download the database file pubs_nips directly from the Downloads section here.
+
+Current external Python dependencies:
+
+BeautifulSoup [for easy and robust HTML parsing]
+simplejson [for parsing outputs of Google searches using their API]
+PDFMiner [for parsing PDFs and extracting text]
+
+Note that even if you are missing some of the above, you may still be able to use a large
+portion of the library functions.
+
+-------------------------------------------------------------------------------
+EXAMPLE USAGE
+-------------------------------------------------------------------------------
+Say you unexpectedly became very interested in Deep Belief Networks. It now takes 3 lines of Python to open all NIPS papers that mention 'deep' in the title in your browser: (also see demo2)
+
+>>> pubs = loadPubs('pubs_nips')
+>>> p = [x['pdf'] for x in pubs if 'deep' in x['title'].lower()]
+>>> openPDFs(p)
+
+Or maybe you want to open all papers that mention the MNIST dataset? (demo1 also shows how you can easily go on to open the 3 latest ones.)
+>>> pubs = loadPubs('pubs_nips')
+>>> p = [x['pdf'] for x in pubs if 'mnist' in x.get('pdf_text',{})]
+>>> openPDFs(p)
+
+Or how about opening papers that are most similar to some paper at some url? See demo3.
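+
+As a minimal sketch of that last use case (assuming pubs_nips already has 'pdf_text' entries added by nips_add_pdftext.py; the url below is just a placeholder):
+
+>>> from pdf_read import convertPDF
+>>> from repool_util import loadPubs, stringToWordDictionary, openPDFs
+>>> from repool_analysis import publicationSimilarityNaive
+>>> pubs = loadPubs('pubs_nips')
+>>> p = {'pdf_text': stringToWordDictionary(convertPDF('http://example.com/some_paper.pdf'))}
+>>> scores = publicationSimilarityNaive(pubs, p)
+>>> best = sorted([(s, i) for i, s in enumerate(scores) if s >= 0], reverse=True)
+>>> openPDFs([pubs[i]['pdf'] for s, i in best[:3]])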
+
+-------------------------------------------------------------------------------
+ORGANIZATION, I/O AND DATA REPRESENTATIONS
+-------------------------------------------------------------------------------
+
+Here's the idea for the near future, I think: there will be several stage 1 scripts, each of which is responsible for parsing a particular venue of publications. For example, the stage 1 script nips_download_parse.py parses and outputs all publications in NIPS from 2003 to 2010 (but does not analyze the text).
+
+The idea is to have very similar scripts for other venues, such as ICML, CVPR, etc. The output of each such script should be a pickled list of dictionaries. Each dictionary represents a publication. For example:
+[{'title': 'Solving AI using Random Forests', 'authors': ['Jim Smith', 'Bill Smith'], 'year': 2020, 'venue': 'NIPS 2020', 'pdf': 'http://google.com/ai'},
+...]
+
+This representation is a flexible start, as some conference pages provide more information than others, and we don't want to force any particular structure from the get-go. In other words, the database could contain some papers that have the author, title, and abstract, but not the full text. Another entry might have the full text, but may be missing the author or title. Stage 2 scripts will be useful to go over these representations and fill in details in whatever ways possible (such as maybe hooking into other sites like Google Scholar, etc.?).
+
+Note: I am well aware that the "flat list of dictionaries pickled in a file" representation isn't scalable. However, I am a believer in avoiding premature encapsulation. The goal is to keep things as flat as possible, for as long as possible, and to avoid over-engineering things from the start.
+
+Lastly, this representation is actually quite neat because it lets you run all kinds of nice queries very quickly using list comprehensions. For example:
+
+#all papers by Andrew Ng
+>>> [x['title'] for x in pubs if any("A. Ng" in a for a in x['authors'])]
+
+-------------------------------------------------------------------------------
+FAQ
+-------------------------------------------------------------------------------
+Q: Your file x does this, but that's bad practice, and you want to do y. Also you have a bug in z.
+A: Most likely agreed. These scripts/functions are something I hacked together in 3 days during periods of about 1am to 6am. I'd be happy to hear thoughts/suggestions or see fixes or better/alternative ways of doing things. Check the website for this project, which comes with a discussion forum attached.
+
+-------------------------------------------------------------------------------
+MANPOWER ADVERTISEMENTS / IDEAS
+-------------------------------------------------------------------------------
+Advertisement posting 1:
+1. Pick your favorite conference/journal
+2. Look through their page and write an HTML parser in the style of my nips_download_parse.py
+3. Output the same type of representation as described above in a file 'pubs_conferenceblah'
+4. Publish the scripts you used so that it is faster for others to do similar things
+5. Upload your output pubs_ file, or send it to me for publishing
+
+Advertisement posting 2:
+It should be possible to take an arbitrary directory full of PDF files and create a pubs_ file for them. Can titles and authors be reliably extracted from PDFs somehow? Can tools be made that at least partially automate the process so that different parsers don't have to be written for each venue? Can we find large databases of papers/information online that we can scrape and enter? (A rough sketch of such a directory-to-pubs_ conversion loop is shown below.)
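+
+The sketch below is illustrative only: 'my_pdf_dir' and 'pubs_mydir' are made-up names, and titles/authors are left unfilled since reliably extracting them is exactly the open question above.
+
+import os
+from pdf_read import convertPDF
+from repool_util import savePubs, stringToWordDictionary
+
+pubs = []
+for fname in os.listdir('my_pdf_dir'):
+    if not fname.lower().endswith('.pdf'):
+        continue
+    path = os.path.join('my_pdf_dir', fname)
+    try:
+        txt = convertPDF(path) # extract the raw text with PDFMiner
+    except:
+        print 'could not parse %s, skipping...' % (fname,)
+        continue
+    # store at least the path and the bag of words; title/authors are TODO
+    pubs.append({'pdf': path, 'pdf_text': stringToWordDictionary(txt)})
+
+savePubs('pubs_mydir', pubs)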
+
+Advertisement posting 3:
+Heavy-duty Machine Learning tools (such as Naive Bayes, lol) are needed that can work on top of the representations stored in pubs_ files and answer questions such as 'what papers in the database are most similar to the one at this url?', or 'what are the common topics?'
+
+etc etc...
\ No newline at end of file
diff --git a/demo1.py b/demo1.py
new file mode 100644
index 0000000..9f3a718
--- /dev/null
+++ b/demo1.py
@@ -0,0 +1,41 @@
+"""
+Some examples of fun things that can be done using the current 'API'
+"""
+
+from repool_util import loadPubs, openPDFs
+
+def demo1():
+    """
+    You wrote an algorithm and benchmarked it on the MNIST dataset. You are
+    wondering how your results compare with those in the literature:
+    1. Find all publications that mention mnist
+    2. Print out their titles
+    3. Open the three latest publications that mention it at least twice
+
+    Pre-requisites:
+    - Assumes 'pubs_nips' exists. This can be obtained by running
+      nips_download_parse.py and then nips_add_pdftext.py, or by downloading
+      it from the site. See README.txt
+
+    Side-effects:
+    - will use an os call to open a pdf with the default program
+    """
+
+    pubs = loadPubs('pubs_nips')
+
+    # get all papers that mention mnist
+    p = [x for x in pubs if 'mnist' in x.get('pdf_text', {})]
+    print 'titles of papers that mention the MNIST dataset:'
+    for x in p:
+        print x['title']
+
+    # keep the papers that mention mnist at least twice, newest first
+    occ = [(x['year'], i) for i, x in enumerate(p) if x['pdf_text']['mnist'] > 1]
+    occ.sort(reverse = True)
+
+    # open the 3 latest in the browser
+    print "opening the top 3..."
+    openPDFs([p[i]['pdf'] for y, i in occ[:3]])
+
+if __name__ == '__main__':
+    demo1()
diff --git a/demo2.py b/demo2.py
new file mode 100644
index 0000000..a8edf33
--- /dev/null
+++ b/demo2.py
@@ -0,0 +1,34 @@
+"""
+Some examples of fun things that can be done using the current 'API'
+"""
+
+from repool_util import loadPubs, openPDFs
+
+def demo2():
+    """
+    You unexpectedly became very interested in Deep Belief Networks. As a first
+    stab at some background reading, you want to:
+    1. Find all NIPS publications with Deep in the title
+    2. Open them in the browser
+
+    Pre-requisites:
+    - Assumes 'pubs_nips' exists. This can be obtained by running
+      nips_download_parse.py or by downloading it from the site. See README.txt
+
+    Side-effects:
+    - will use an os call to open a pdf with the default program
+    """
+
+    pubs = loadPubs('pubs_nips')
+
+    # get urls that correspond to publications with deep in the title
+    p = [x['pdf'] for x in pubs if 'deep' in x['title'].lower()]
+
+    if len(p) > 10:
+        print 'oops, too many (%d) results! Only opening the first 10.' % (len(p),)
+        p = p[:10]
+
+    openPDFs(p)
+
+if __name__ == '__main__':
+    demo2()
diff --git a/demo3.py b/demo3.py
new file mode 100644
index 0000000..3e4a95e
--- /dev/null
+++ b/demo3.py
@@ -0,0 +1,54 @@
+"""
+Some examples of fun things that can be done using the current 'API'
+"""
+
+from repool_util import loadPubs, stringToWordDictionary, openPDFs
+from repool_analysis import publicationSimilarityNaive
+from pdf_read import convertPDF
+
+def demo3():
+    """
+    You found a cool paper online and you want to find similar papers:
+    1. Download and parse the pdf
+    2. Compare it to the text of all publications in the pubs_ database
+    3. Open the top 3 matches in the browser (but note that the current
+       matching algorithm is very basic and could be much improved)
+
+    Pre-requisites:
+    - Assumes 'pubs_nips' exists. This can be obtained by running
+      nips_download_parse.py and then nips_add_pdftext.py,
+      or by downloading it from the site. See README.txt
+
+    Side-effects:
+    - will use an os call to open a pdf with the default program
+    """
+
+    # fetch this pdf from the web, parse it, and make a publication dict from it
+    # here is a random pdf from Andrew Ng's website
+    url = 'http://ai.stanford.edu/~ang/papers/icml11-DeepEnergyModels.pdf'
+    print "downloading %s..." % (url,)
+    text = convertPDF(url) # extract the text
+    bow = stringToWordDictionary(text) # extract the bag of words representation
+    p = {'pdf_text' : bow} # create a dummy publication dict
+
+    # calculate similarities to our publications
+    print 'loading database...'
+    pubs = loadPubs('pubs_nips')
+    print "computing similarities (may take a while with the current implementation)"
+    scores = publicationSimilarityNaive(pubs, p)
+
+    # find the highest scoring pubs
+    lst = [(s, i) for i, s in enumerate(scores) if s >= 0]
+    lst.sort(reverse = True)
+
+    # display the top 50 matches
+    m = min(50, len(lst))
+    for s, i in lst[:m]:
+        print "%.2f is similarity to %s." % (s, pubs[i]['title'])
+
+    # open the top 3 in the browser
+    print "opening the top 3..."
+    openPDFs([pubs[i]['pdf'] for s, i in lst[:3]])
+
+if __name__ == '__main__':
+    demo3()
diff --git a/google_search.py b/google_search.py
new file mode 100644
index 0000000..6488a7e
--- /dev/null
+++ b/google_search.py
@@ -0,0 +1,31 @@
+"""
+Functions for searching Google and retrieving urls
+"""
+
+import urllib
+import simplejson
+
+def getPDFURL(pdf_title):
+    """
+    Search Google for an exact match of the title of this paper
+    and return the url of the pdf file.
+
+    pdf_title: string, name of the paper.
+    Returns url of the PDF, or 'notfound' if unsuccessful
+    """
+
+    # get results in JSON
+    query = urllib.urlencode({'q' : pdf_title + ' filetype:pdf'})
+    url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s' \
+          % (query,)
+    search_results = urllib.urlopen(url)
+    json = simplejson.loads(search_results.read())
+    results = json['responseData']['results']
+
+    # sift through them in search of an exact title match
+    for r in results:
+        if r['title'] == pdf_title:
+            return r['url']
+
+    return 'notfound'
+
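+# An illustrative sketch of a possible stage 2 enrichment loop that uses
+# getPDFURL to fill in missing 'pdf' urls in a pubs_ file. This is only a
+# sketch: 'pubs_nips' is just an example filename, and Google limits the rate
+# of queries, so long runs may need throttling.
+def _example_fill_missing_pdf_urls(pubs_filename='pubs_nips'):
+    from repool_util import loadPubs, savePubs
+
+    pubs = loadPubs(pubs_filename)
+    for p in pubs:
+        if p.has_key('title') and not p.has_key('pdf'):
+            url = getPDFURL(p['title'])
+            if url != 'notfound':
+                p['pdf'] = url
+    savePubs(pubs_filename, pubs)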
diff --git a/nips_add_pdftext.py b/nips_add_pdftext.py
new file mode 100644
index 0000000..00cf89d
--- /dev/null
+++ b/nips_add_pdftext.py
@@ -0,0 +1,55 @@
+"""
+Standalone helper script.
+
+Loads the nips pubs_ file and adds to every paper its word counts under the
+key 'pdf_text'. The PDF for each paper is downloaded from the NIPS site.
+"""
+
+from repool_util import loadPubs, savePubs, stringToWordDictionary
+from pdf_read import convertPDF
+
+pubs_all = loadPubs('pubs_nips')
+print 'loaded pubs with %d entries.' % (len(pubs_all),)
+
+# possibly place restrictions on pubs to process here
+pubs = pubs_all
+
+for i, p in enumerate(pubs):
+
+    # if the pdf url does not exist, this could in the future possibly use a
+    # google search to try to look up a link for the pdf first.
+    if p.has_key('pdf') and not p.has_key('pdf_text'):
+
+        # try to open the PDF from the downloaded location
+        processed = False
+        try:
+            floc = p['pdf'].index('NIPS')
+            fname = p['pdf'][floc:]
+            txt = convertPDF('downloads/' + fname)
+            processed = True
+            print 'found %s in file!' % (p['title'],)
+        except:
+            pass
+
+        if not processed:
+            # download the PDF and convert it to text
+            try:
+                print 'downloading pdf for [%s] and parsing...' % (p.get('title', 'an un-titled paper'),)
+                txt = convertPDF(p['pdf'])
+                processed = True
+                print 'processed from url!'
+            except:
+                print 'error: unable to download or open the pdf from %s' % (p['pdf'],)
+                print 'skipping...'
+
+        if processed:
+            # convert to a bag of words and store
+            try:
+                p['pdf_text'] = stringToWordDictionary(txt)
+            except:
+                print 'was unable to convert the text to a bag of words. Skipped.'
+
+    print '%d/%d = %.2f%% done.' % (i+1, len(pubs), 100*(i+1.0)/len(pubs))
+
+savePubs('pubs_nips', pubs_all)
\ No newline at end of file
diff --git a/nips_download_parse.py b/nips_download_parse.py
new file mode 100644
index 0000000..3314f22
--- /dev/null
+++ b/nips_download_parse.py
@@ -0,0 +1,117 @@
+"""
+Standalone helper script.
+
+Parses the NIPS proceedings for years 2003-2010, creates a list of dictionaries
+that store information about each publication, and saves the result as a
+pickle in the current directory called pubs_nips.
+"""
+
+import urllib
+from BeautifulSoup import BeautifulSoup, Tag, NavigableString
+from repool_util import savePubs
+
+pubs = []
+warnings = []
+for num in range(16, 24):
+    year = 1987 + num
+
+    url = "http://books.nips.cc/nips%d.html" % (num,)
+    print "downloading proceedings from NIPS year %d..." % (year,)
+    f = urllib.urlopen(url)
+    s = f.read()
+    f.close()
+
+    print "done. Parsing..."
+    soup = BeautifulSoup(s)
+    soup = soup.find('table', {'width' : '600'}) # find the main HTML table
+    soup = soup.contents[0].contents[0] # descend into its first row and cell
+
+    # iterate over this giant linear dump they have on the proceedings site
+    venue = 'NIPS %d' % (year,)
+    new_pub = {}
+    old_count = len(pubs)
+    for item in soup.contents:
+
+        if isinstance(item, Tag):
+            if item.name == 'b':
+
+                # we stumbled on a new publication entry. If we were processing
+                # one before this, commit that one first, then continue
+                if new_pub:
+                    if not new_pub.has_key('authors'):
+                        warnings.append("oh oh, no authors for publication... ")
+
+                    if not new_pub.has_key('title'):
+                        warnings.append("oh oh, no title for publication... ")
+
+                    new_pub['venue'] = venue
+                    new_pub['year'] = year
+                    pubs.append(new_pub)
+
+                # start a new publication dictionary
+                new_pub = {}
+                new_title = str(item.contents[0]) # descend down a tag
+                new_title = new_title.replace('\n', '')
+                new_pub['title'] = new_title
+
+            if item.name == 'a':
+                modifier = str(item.contents[0]).strip()
+                if modifier == '[pdf]':
+                    new_pub['pdf'] = str(item.attrs[0][1])
+                elif modifier == '[bibtex]':
+                    new_pub['bibtex'] = str(item.attrs[0][1])
+                elif modifier == '[correction]':
+                    new_pub['correction'] = str(item.attrs[0][1])
+                elif modifier == '[supplemental]':
+                    new_pub['supplemental'] = str(item.attrs[0][1])
+                elif modifier == '[slide]':
+                    new_pub['slide'] = str(item.attrs[0][1])
+                elif modifier == '[audio]':
+                    new_pub['audio'] = str(item.attrs[0][1])
+                elif modifier == '[ps.gz]':
+                    pass # ignore
+                elif modifier == '[djvu]':
+                    pass # ignore
+                else:
+                    warnings.append("warning: modifier %s skipped" % (modifier,))
+
+        if isinstance(item, NavigableString):
+            if len(str(item)) > 3:
+
+                # this is probably the line describing the authors
+                author_str = str(item)
+                author_str = author_str.replace('\n', '') # remove newlines
+                author_list = author_str.split(',')
+                if new_pub.has_key('authors'):
+                    warnings.append("we're in trouble... %s, but already have "\
+                                    "%s" % (str(item), new_pub['authors']))
+
+                new_pub['authors'] = [x.strip() for x in author_list]
+
+    # commit the last publication of the year, if there is one
+    # I hate myself a little for this
+    # TODO LATER_MAYBE: CODE CHUNK DUPLICATION
+    if new_pub:
+        if not new_pub.has_key('authors'):
+            warnings.append("oh oh, no authors for publication... ")
+        if not new_pub.has_key('title'):
+            warnings.append("oh oh, no title for publication... ")
+        new_pub['venue'] = venue
+        new_pub['year'] = year
+        pubs.append(new_pub)
+
+    print "read in %d publications for year %d." % (len(pubs) - old_count, year)
+
+
+# show warnings, if any were generated
+if len(warnings) > 0:
+    print "%d warnings:" % (len(warnings),)
+    for x in warnings:
+        print x
+else:
+    print "No warnings generated."
+
+# finally, save the pickle as output
+print "read in a total of %d publications." % (len(pubs),)
+fname = "pubs_nips"
+print "saving pickle in %s" % (fname,)
+savePubs(fname, pubs)
+print "all done."
diff --git a/pdf_read.py b/pdf_read.py
new file mode 100644
index 0000000..30c0d1b
--- /dev/null
+++ b/pdf_read.py
@@ -0,0 +1,40 @@
+"""
+Functions for PDF parsing tools and utils
+"""
+
+from pdfminer.pdfinterp import PDFResourceManager, process_pdf
+from pdfminer.converter import TextConverter
+from pdfminer.layout import LAParams
+from cStringIO import StringIO
+
+import urllib
+
+def convertPDF(pdf_path, codec='ascii'):
+    """
+    Takes a path to a PDF and returns the text inside it
+
+    pdf_path: string indicating the path to a .pdf file. Can also be a URL
+              starting with 'http'
+    codec: can be 'ascii', 'utf-8', ...
+    returns the text of the pdf, as it comes out raw from PDFMiner
+    """
+
+    if pdf_path[:4] == 'http':
+        print 'first downloading %s ...' % (pdf_path,)
+        urllib.urlretrieve(pdf_path, 'temp.pdf')
+        pdf_path = 'temp.pdf'
+
+    rsrcmgr = PDFResourceManager()
+    retstr = StringIO()
+    laparams = LAParams()
+    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
+
+    fp = open(pdf_path, 'rb')
+    process_pdf(rsrcmgr, device, fp)
+    fp.close()
+    device.close()
+
+    text = retstr.getvalue()
+    retstr.close()
+
+    return text
diff --git a/repool_analysis.py b/repool_analysis.py
new file mode 100644
index 0000000..2ae2fdf
--- /dev/null
+++ b/repool_analysis.py
@@ -0,0 +1,39 @@
+"""
+Functions: some stage 3 functions. Analyze pubs_ files and
+provide more high-level functionality
+"""
+
+def publicationSimilarityNaive(train_pubs, test_pub):
+    """
+    Find similarities of publications to some particular publication,
+    using a very simple overlap method.
+
+    train_pubs: list of publications
+    test_pub: a publication dictionary. Must have the key 'pdf_text' with the
+              bag of words that occur in that publication
+
+    returns a list of scores, one for each of the train_pubs. The score is -1
+    for any publication that does not have 'pdf_text' available.
+    """
+
+    if not test_pub.has_key('pdf_text'):
+        return []
+
+    scores = [-1 for i in range(len(train_pubs))]
+    wnum_test = len(test_pub['pdf_text'])
+    words = test_pub['pdf_text'].keys()
+
+    for i, p in enumerate(train_pubs):
+        if i % 100 == 0: print "%d/%d" % (i+1, len(train_pubs))
+
+        if not p.has_key('pdf_text'): continue
+
+        # find the score of the match
+        wnum_train = len(p['pdf_text'])
+
+        # word overlap normalized by the two document sizes (the Dice coefficient)
+        # (a random thing I just thought of 5 seconds ago)
+        overlap = sum([1 for x in words if x in p['pdf_text']])
+        scores[i] = 2.0 * overlap / (wnum_train + wnum_test)
+
+    return scores
+
diff --git a/repool_util.py b/repool_util.py
new file mode 100644
index 0000000..fa46c77
--- /dev/null
+++ b/repool_util.py
@@ -0,0 +1,74 @@
+""" Functions: useful general utils """
+
+import pickle
+import re
+from os import startfile # note: os.startfile is Windows-only
+
+def savePubs(filename, pubs_to_save):
+    """
+    backup a list of publications
+    filename: string
+    pubs_to_save: list of publication dictionaries
+
+    returns nothing
+    """
+
+    f = open(filename, 'w')
+    pickle.dump(pubs_to_save, f)
+    f.close()
+
+def loadPubs(filename):
+    """
+    retrieve a saved list of publications
+    filename: string
+    returns a list of dictionaries, each representing a publication
+    """
+
+    unpicklefile = open(filename, 'r')
+    pubs = pickle.load(unpicklefile)
+    unpicklefile.close()
+    return pubs
+
+def openPDFs(pdf_lst):
+    """
+    uses an os call to open a list of pdfs
+    pdf_lst: list of strings: paths (or urls) of the pdfs to open
+    """
+    if len(pdf_lst) > 10:
+        print "more than 10? that can't be right. Request denied."
+        return
+
+    for x in pdf_lst:
+        startfile(x)
+
+def stringToWordDictionary(s):
+    """
+    Takes a string and returns a dictionary that stores the frequency of
+    every word.
+
+    s: string
+    returns a dictionary of word counts for each word. Example: d['hello'] -> 5
+    """
+    s = s.lower() # convert to lower case
+    m = re.findall('[a-zA-Z\-]+', s)
+    m = [x for x in m if len(x) > 2] # filter out small words
+
+    # count the number of occurrences of each word
+    d = {}
+    for i in m: d[i] = d.get(i, 0) + 1
+
+    # remove stopwords
+    stopwords = ['the', 'and', 'for', 'that', 'can', 'this', 'which', \
+                 'where', 'are', 'from', 'our', 'not', 'with', 'use', \
+                 'then', 'than', 'but', 'have', 'was', 'were', 'these', \
+                 'each', 'used', 'set', 'such', 'using', 'when', 'those', \
+                 'may', 'also']
+
+    # 'cid' is an artifact from the pdf conversion that occurs very often
+    stopwords.extend(['cid'])
+
+    keys = d.keys()
+    for k in keys:
+        if k in stopwords:
+            del d[k]
+
+    return d