diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..1f47892
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,5 @@
+source 'https://rubygems.org'
+ruby '2.0.0'
+
+gem 'rspec', '~> 2.14.1'
+gem 'pry-byebug'
diff --git a/gutenberg.rb b/gutenberg.rb
index 84d20f6..c4c342c 100644
--- a/gutenberg.rb
+++ b/gutenberg.rb
@@ -15,7 +15,7 @@ def run!(predictor_klass, opts={})
   start_time = Time.now
   predictor.train!
   puts "Training took #{Time.now - start_time} seconds."
-  
+
   puts "Predicting..."
   start_time = Time.now
   accuracy = predictor.predict_test_set(opts)
diff --git a/lib/complex_predictor.rb b/lib/complex_predictor.rb
index b8921f3..7e5286e 100644
--- a/lib/complex_predictor.rb
+++ b/lib/complex_predictor.rb
@@ -5,18 +5,64 @@ class ComplexPredictor < Predictor
   # before the predict() method is called.
   #
   # Returns nothing.
   def train!
     @data = {}
+    @all_books.each do |category, books|
+      stats = @data[category] = { words: 0, books: 0, top_words: [] }
+      books.each do |_filename, tokens|
+        stats[:words] += tokens.count
+        stats[:books] += 1
+        stats[:top_words].concat(good_token_count(tokens))
+      end
+    end
   end
 
+  # Public: Gets/Sets the per-category statistics Hash built by train!.
+  attr_accessor :data
+
+  # How many of the largest token counts qualify a word as a "top word".
+  TOP_WORD_LIMIT = 50
+
+  # Public: Finds the most frequent good tokens in a list of tokens.
+  #
+  # tokens - A list of tokens (words).
+  #
+  # Returns an Array of the distinct tokens accepted by good_token? whose
+  # occurrence count is at least the TOP_WORD_LIMIT-th largest count (ties at
+  # the cutoff are all kept), in order of first appearance. When there are
+  # fewer than TOP_WORD_LIMIT distinct good tokens, all of them are returned.
+  def good_token_count(tokens)
+    counts = Hash.new(0)
+    tokens.each { |token| counts[token] += 1 if good_token?(token) }
+
+    # Array#[] returns nil when fewer than TOP_WORD_LIMIT counts exist, and
+    # comparing a count with nil raises, so fall back to a cutoff of 0.
+    cutoff = counts.values.sort[-TOP_WORD_LIMIT] || 0
+    counts.select { |_word, count| count >= cutoff }.keys
+  end
+
   # Public: Predicts category.
   #
   # tokens - A list of tokens (words).
   #
   # Returns a category.
   def predict(tokens)
-    # Always predict astronomy, for now.
-    :astronomy
+    # Pick the category whose training top words overlap most with this
+    # book's top words.
+    book_top_words = good_token_count(tokens)
+    predicted_category = nil
+    best_match_count = 0
+
+    @data.each do |category, cat_data|
+      match_count = (book_top_words & cat_data[:top_words]).length
+      # With `>=`, a tie (including zero matches everywhere) goes to the
+      # category visited last, so nil is returned only when @data is empty.
+      if match_count >= best_match_count
+        best_match_count = match_count
+        predicted_category = category
+      end
+    end
+    predicted_category
   end
 end
 
diff --git a/lib/simple_predictor.rb b/lib/simple_predictor.rb
index 6b93003..72ed759 100644
--- a/lib/simple_predictor.rb
+++ b/lib/simple_predictor.rb
@@ -18,6 +18,7 @@ def train!
     #     philosophy: {
     #       words: 1000,
     #       books: 10,
+    ###add good tokens
     #     },
     #     archeology: {
     #       words: 2000,
diff --git a/spec/Gemfile b/spec/Gemfile
new file mode 100644
index 0000000..1f47892
--- /dev/null
+++ b/spec/Gemfile
@@ -0,0 +1,5 @@
+source 'https://rubygems.org'
+ruby '2.0.0'
+
+gem 'rspec', '~> 2.14.1'
+gem 'pry-byebug'
diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
new file mode 100644
index 0000000..a950747
--- /dev/null
+++ b/spec/spec_helper.rb
@@ -0,0 +1,4 @@
+#require this file in your spec files to help DRY up your tests
+require 'rspec'
+require 'pry-byebug'
+require_relative '../gutenberg.rb'