makersquare · thealaj · Sep 3, 2014 · Sep 4, 2014 · Sep 4, 2014 · Sep 4, 2014
diff --git a/Gemfile b/Gemfile
@@ -0,0 +1,5 @@
+source 'https://rubygems.org'
+ruby '2.0.0'
+# Test and debugging tools; both are required by spec/spec_helper.rb.
+gem 'rspec', '~> 2.14.1'
+gem 'pry-byebug'
diff --git a/gutenberg.rb b/gutenberg.rb
@@ -15,7 +15,7 @@ def run!(predictor_klass, opts={})
   start_time = Time.now
   predictor.train!
   puts "Training took #{Time.now - start_time} seconds."
-
+  
   puts "Predicting..."
   start_time = Time.now
   accuracy = predictor.predict_test_set(opts)

diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb
@@ -5,18 +5,87 @@ class ComplexPredictor < Predictor
   # before the predict() method is called.
   #
   # Returns nothing.
+attr_accessor :data
   def train!
     @data = {}
+
+
+    @all_books.each do |category, books|
+      @data[category] = {
+        words: 0,
+        books: 0,
+        top_words: [] 
+      }
+      books.each do |filename, tokens|
+        # title = find_title(tokens)
+        #   title.each do |t|
+        #     if good_token?(t)
+        #       @data[category][:top_words].push(t)
+        #     end
+        #   end
+        @data[category][:words] += tokens.count
+        @data[category][:books] += 1
+        good_token_count(tokens).each { |x| @data[category][:top_words].push(x) }
+      end
+    end
   end
 
+  def good_token_count(tokens)
+    @good_token_count = {}
+    @top_words = []
+    tokens.each do |t|
+      if good_token?(t) && @good_token_count[t] == nil
+          @good_token_count[t] = 1
+      elsif good_token?(t) && @good_token_count[t]
+          @good_token_count[t] += 1
+      end
+    end
+    cutoff_val = @good_token_count.values.sort[-50]
+    top_words = @good_token_count.select {|k,v| v >= cutoff_val}
+    top_words.each {|word, count| @top_words.push(word)}  
+    @top_words
+  end      
+
+  # def find_title(tokens)
+  # title_start = tokens.index("title") 
+  # title_end = tokens.index("author")
+  # title = tokens[title_start...title_end]
+  # title.delete("title")
+  # title 
+  # end   
+
+
+
   # Public: Predicts category.
   #
   # tokens - A list of tokens (words).
   #
   # Returns a category.
+
   def predict(tokens)
     # Always predict astronomy, for now.
-    :astronomy
-  end
+    # :astronomy
+    predicted_category = nil
+    counter = 0
+
+    predictee_top_words = good_token_count(tokens)
+    # title_test = find_title(tokens)
+
+    @data.each do |category, cat_data|
+    matching_words = (predictee_top_words & cat_data[:top_words])
+      max_matches = matching_words.length
+        if max_matches >= counter
+          counter = max_matches
+          predicted_category = category
+      end
+    #     if title_test.include?(category.to_s)
+    #     predicted_category = category
+    # end 
+    end    
+    predicted_category
+    end
+
 end
 
+
+
diff --git a/lib/simple_predictor.rb b/lib/simple_predictor.rb
@@ -18,6 +18,7 @@ def train!
     #   philosophy: {
     #     words: 1000,
     #     books: 10,
+    #     # TODO: add good tokens
     #   },
     #   archeology: {
     #     words: 2000,

diff --git a/spec/Gemfile b/spec/Gemfile
@@ -0,0 +1,5 @@
+source 'https://rubygems.org'
+ruby '2.0.0'
+# NOTE(review): same contents as the root Gemfile; confirm this copy is needed.
+gem 'rspec', '~> 2.14.1'
+gem 'pry-byebug'
diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
@@ -0,0 +1,4 @@
+# Require this file in your spec files to help DRY up your tests.
+require 'rspec'
+require 'pry-byebug'
+require_relative '../gutenberg.rb'