Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gutenberg Project | Alexandria A. Johnson #77

Open
wants to merge 5 commits into
base: gutenberg
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Fetch gems from the public RubyGems index.
source 'https://rubygems.org'
# Ruby version this project declares itself to run on.
ruby '2.0.0'

# Test framework; `~> 2.14.1` accepts 2.14.x releases from 2.14.1 upward.
gem 'rspec', '~> 2.14.1'
# Debugging aid; spec/spec_helper.rb requires it.
gem 'pry-byebug'
2 changes: 1 addition & 1 deletion gutenberg.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def run!(predictor_klass, opts={})
start_time = Time.now
predictor.train!
puts "Training took #{Time.now - start_time} seconds."

puts "Predicting..."
start_time = Time.now
accuracy = predictor.predict_test_set(opts)
Expand Down
73 changes: 71 additions & 2 deletions lib/complex_predictor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,87 @@ class ComplexPredictor < Predictor
# before the predict() method is called.
#
# Returns nothing.
attr_accessor :data
# Public: Builds per-category statistics from @all_books and stores them in
# @data, keyed by category. Each entry holds:
#
#   :words     - total number of tokens across the category's books
#   :books     - number of books in the category
#   :top_words - the frequent "good" tokens of every book, appended book by
#                book (a word may therefore appear more than once)
#
# A title-based heuristic (find_title) was tried here and is currently
# disabled.
def train!
  @data = {}

  @all_books.each do |category, books|
    stats = { words: 0, books: 0, top_words: [] }
    @data[category] = stats

    books.each do |_filename, tokens|
      stats[:words] += tokens.count
      stats[:books] += 1
      stats[:top_words].concat(good_token_count(tokens))
    end
  end
end

# Public: Finds the most frequent "good" tokens in a list of tokens.
#
# tokens - A list of tokens (words). Only tokens accepted by good_token?
#          are counted.
# limit  - Positive Integer: the cutoff is the limit-th highest count
#          (default: 50). Tokens tied with the cutoff count are all kept, so
#          the result may be longer than limit.
#
# Returns an Array of the selected tokens in order of first appearance.
# Returns every good token when there are fewer than limit distinct ones,
# and an empty Array when there are none.
def good_token_count(tokens, limit = 50)
  counts = Hash.new(0)
  tokens.each { |token| counts[token] += 1 if good_token?(token) }

  ranked = counts.values.sort
  # With fewer than `limit` distinct tokens there is no limit-th highest
  # count; fall back to the lowest count so every token is kept rather than
  # comparing an Integer against nil (which raises ArgumentError).
  cutoff = ranked[-limit] || ranked.first

  counts.select { |_token, count| count >= cutoff }.keys
end

# def find_title(tokens)
# title_start = tokens.index("title")
# title_end = tokens.index("author")
# title = tokens[title_start...title_end]
# title.delete("title")
# title
# end



# Public: Predicts category.
#
# tokens - A list of tokens (words).
#
# Returns a category.

# Scores every trained category by how many of the book's frequent "good"
# tokens (see good_token_count) also appear in that category's :top_words,
# and returns the best-scoring category. Requires train! to have filled
# @data first.
def predict(tokens)
  candidate_words = good_token_count(tokens)
  best_category = nil
  best_overlap = 0

  @data.each do |category, stats|
    # Array#& yields the distinct words present in both lists.
    overlap = (candidate_words & stats[:top_words]).length

    # `>=` (not `>`) is deliberate: a category is always returned, even with
    # zero overlap, and on ties the category visited last wins.
    if overlap >= best_overlap
      best_overlap = overlap
      best_category = category
    end
  end

  best_category
end

end



1 change: 1 addition & 0 deletions lib/simple_predictor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def train!
# philosophy: {
# words: 1000,
# books: 10,
# TODO: add good tokens
# },
# archeology: {
# words: 2000,
Expand Down
5 changes: 5 additions & 0 deletions spec/Gemfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Fetch gems from the public RubyGems index.
source 'https://rubygems.org'
# Ruby version this project declares itself to run on.
ruby '2.0.0'

# Test framework; `~> 2.14.1` accepts 2.14.x releases from 2.14.1 upward.
gem 'rspec', '~> 2.14.1'
# Debugging aid; spec_helper.rb requires it.
gem 'pry-byebug'
4 changes: 4 additions & 0 deletions spec/spec_helper.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Shared spec setup: require this file from each spec file so the common
# requires live in one place (keeps the specs DRY).
require 'rspec'
require 'pry-byebug'
# Loads gutenberg.rb from the directory above spec/.
require_relative '../gutenberg.rb'