bug fixes for my rash initial commit
karpathy committed Dec 2, 2011
1 parent c7940cf commit 0fb81d6
Showing 5 changed files with 31 additions and 21 deletions.
19 changes: 12 additions & 7 deletions README
@@ -1,5 +1,6 @@
 REsearch POOLer (repool)
 v0.1
+Project site: https://sites.google.com/site/researchpooler/home

 Authors: Andrej Karpathy <[email protected]> || <[email protected]>, http://cs.stanford.edu/~karpathy/

 -------------------------------------------------------------------------------
@@ -20,17 +21,21 @@ STAGE 4 files: create nice web-based UI (maybe using Google App Engine?) to make
 -------------------------------------------------------------------------------
 INSTALLATION
 -------------------------------------------------------------------------------
-Simply download scripts and browse around since the project is young. Unless you want to run the scripts from scratch to generate the NIPS database, download the database file pubs_nips directly from Downloads section here.
-
-Current external Python dependencies:
+1. Download pubs_nips from the site[*] (https://sites.google.com/site/researchpooler/downloads)
+2. Browse around (project is young, no installation needed so far!)
+3. Download/Install current Python dependencies:

 BeautifulSoup [for easy and robust HTML parsing]
-simplejson [for parsing outputs of Google searches using their API]
 PDFMiner [for parsing PDFs and extracting text]
+simplejson [OPTIONAL. for parsing outputs of Google searches using their API]

-Note that even if you are missing some of the above, you may still be able to use large
-portion of the library functions.
+4. Enjoy the demos
+
+[*] Instead of downloading the database you can also regenerate the pubs_nips database yourself using the two scripts I wrote. Simply run:
+$> python nips_download_parse.py
+(takes a few seconds) and then
+$> python nips_add_pdftext.py
+(takes potentially an hour or two because it has to download and parse all papers published at NIPS since 2003)
 -------------------------------------------------------------------------------
 EXAMPLE USAGE
 -------------------------------------------------------------------------------
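Note on the installation steps above: end to end they amount to a few lines of Python. A minimal sketch (assuming, as the README says, that the two scripts write the pubs_nips file into the current directory):

    # regenerate the database, then load it -- a sketch of the README steps
    import os
    os.system('python nips_download_parse.py')   # parses the NIPS index; takes seconds
    os.system('python nips_add_pdftext.py')      # downloads and parses PDFs; takes hours

    from repool_util import loadPubs
    pubs = loadPubs('pubs_nips')                 # list of publication dicts
    print "loaded %d publications" % (len(pubs),)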
19 changes: 11 additions & 8 deletions demo1.py
@@ -2,8 +2,7 @@
 Some examples of fun things that can be done using the current 'API'
 """

-from repool_util import loadPubs, openPDFs, stringToWordDictionary
-from pdf_read import convertPDF
+from repool_util import loadPubs, openPDFs

 def demo1():
     """
@@ -14,28 +13,32 @@ def demo1():
     3. Open the three latest publications that mention it at least twice
     Pre-requisites:
-    - Assumes 'pubs_nips' exists. This can be obtained by running
-      nips_download_parse.py or by downloading it from site. See README.txt
+    - Assumes 'pubs_nips' exists and that pdf text is present.
+      This can be obtained by running
+      nips_download_parse.py and then nips_add_pdftext.py, or by downloading it
+      from site. See README.txt
     Side-effects:
     - will use os call to open a pdf with default program
     """

+    print "loading the NIPS publications dataset..."
     pubs = loadPubs('pubs_nips')

     # get all papers that mention mnist
-    p = [x['title'] for x in pubs if 'mnist' in x.get('pdf_text',{})]
-    print 'titles of papers that mention MNIST dataset:'
+    p = [x for x in pubs if 'mnist' in x.get('pdf_text',{})]
+    print "titles of papers that mention MNIST dataset:"
     for x in p:
         print x['title']
+    print "total of %d publications mention MNIST." %(len(p),)

     # sort by number of occurences
-    occ = [(x['year'], i) for i,x in p if x['pdf_text']['mnist']>1]
+    occ = [(x['year'], x['pdf']) for i,x in enumerate(p) if x['pdf_text']['mnist']>1]
     occ.sort(reverse = True)

     # open the top 3 latest in browser
     print "opening the top 3..."
-    openPDFs([x['pdf'] for x in occ[:3]])
+    openPDFs([x for year,x in occ[:3]])

 if __name__ == '__main__':
     demo1()
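The occ line above is the heart of this fix: the old comprehension built p as a list of titles and then wrote 'for i,x in p', which has no (index, value) pairs to unpack, and it kept the loop index where a pdf url was needed. A standalone sketch of the corrected pattern, with made-up publication dicts purely for illustration:

    # hypothetical publications in the shape demo1.py expects
    p = [{'year': 2010, 'pdf': 'http://example.com/a.pdf', 'pdf_text': {'mnist': 3}},
         {'year': 2011, 'pdf': 'http://example.com/b.pdf', 'pdf_text': {'mnist': 2}}]

    # enumerate() supplies the (i, x) pairs the old code assumed existed,
    # and the tuples carry the pdf url rather than the loop index
    occ = [(x['year'], x['pdf']) for i,x in enumerate(p) if x['pdf_text']['mnist']>1]
    occ.sort(reverse = True)                # latest years first
    print [pdf for year,pdf in occ[:3]]     # urls only, ready for openPDFs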
7 changes: 4 additions & 3 deletions demo2.py
@@ -19,14 +19,15 @@ def demo2():
     - will use os call to open a pdf with default program
     """

+    print "loading the NIPS publications dataset..."
     pubs = loadPubs('pubs_nips')

     # get urls that correspond to publications with deep in title
     p = [x['pdf'] for x in pubs if 'deep' in x['title'].lower()]

-    if len(p)>10:
-        p=p[:10]
-        print 'oops too many (%d) results! Only opening random 10.' % (len(p),)
+    if len(p)>5:
+        print "oops too many (%d) results! Only opening random 5." % (len(p),)
+        p=p[:5]

     openPDFs(p)

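The demo2.py hunk is an order-of-operations fix: the old code truncated p before printing len(p), so the message always reported the cap (10) rather than the true number of hits. The corrected order, sketched with a stand-in list:

    p = ['url%d' % i for i in range(37)]   # stand-in for 37 matching pdf urls

    if len(p)>5:
        # len(p) is still 37 at this point, so the count printed is truthful
        print "oops too many (%d) results! Only opening random 5." % (len(p),)
        p=p[:5]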
5 changes: 3 additions & 2 deletions demo3.py
@@ -15,7 +15,8 @@ def demo3():
       very basic and could be much improved)
     Pre-requisites:
-    - Assumes 'pubs_nips' exists. This can be obtained by running
+    - Assumes 'pubs_nips' exists and contains pdf text inside
+      (under key 'pdf_text'). This can be obtained by running
       nips_download_parse.py and then nips_add_pdftext.py
       or by downloading it from site. See README.txt
@@ -32,7 +33,7 @@ def demo3():
     p = {'pdf_text' : bow} #create a dummy publication dict

     # calculate similarities to our publications
-    print 'loading database...'
+    print "loading database..."
     pubs = loadPubs('pubs_nips')
     print "computing similarities. (may take while with current implementation)"
     scores = publicationSimilarityNaive(pubs, p)
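For context, the bow in this hunk is a bag-of-words dict (word -> count) for the query text; its construction is not shown in this diff. A hypothetical stand-in (not necessarily how repool_util builds it):

    import re

    def to_bow(text):
        # lowercase word counts, the shape demo3.py's dummy publication expects
        bow = {}
        for w in re.findall(r'[a-z]+', text.lower()):
            bow[w] = bow.get(w, 0) + 1
        return bow

    p = {'pdf_text' : to_bow('deep learning on the mnist digits')}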
2 changes: 1 addition & 1 deletion repool_analysis.py
@@ -24,7 +24,7 @@ def publicationSimilarityNaive(train_pubs, test_pub):
     words = test_pub['pdf_text'].keys()

     for i,p in enumerate(train_pubs):
-        if(i%100==0): print "%d/%d" % (i+1, len(train_pubs))
+        if(i%100==0): print "%d/%d..." % (i, len(train_pubs))

         if not p.has_key('pdf_text'): continue

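Only a fragment of publicationSimilarityNaive is visible here: the progress print being fixed and a skip for publications lacking pdf text. Reading between the lines, the naive score is presumably some bag-of-words overlap between test_pub['pdf_text'] and each training publication. A sketch of that idea -- an assumption, not the file's actual body:

    def similarity_sketch(train_pubs, test_pub):
        # ASSUMED scoring: fraction of the test pub's words that also
        # appear in each training pub's pdf_text bag of words
        words = test_pub['pdf_text'].keys()
        scores = []
        for i,p in enumerate(train_pubs):
            if(i%100==0): print "%d/%d..." % (i, len(train_pubs))
            if not p.has_key('pdf_text'):
                scores.append(0.0)
                continue
            overlap = sum(1 for w in words if w in p['pdf_text'])
            scores.append(float(overlap) / max(1, len(words)))
        return scores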
