require 'rubygems'
# explicit requires for the libraries used below (some may already be loaded by 'index')
require 'open-uri'   # open(url, ...) for fetching pages
require 'uri'        # URI.parse / URI.join
require 'yaml'       # seed file persistence
require 'timeout'    # Timeout::Error
require 'hpricot'    # HTML parsing
require 'robots'     # robots.txt handling
# 'index' is expected to define the Page, Word and Location models plus String#words
require 'index'
LAST_CRAWLED_PAGES = 'seed.yml'
DO_NOT_CRAWL_TYPES = %w(.pdf .doc .xls .ppt .mp3 .m4v .avi .mpg .rss .xml .json .txt .git .zip .md5 .asc .jpg .gif .png)
USER_AGENT = 'saush-spider'
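# The seed file is read with YAML.load_file and written back with YAML.dump, so it is
# assumed to be a plain YAML array of absolute URLs, e.g. (illustrative URLs only):
#
#   ---
#   - http://www.example.com/
#   - http://www.example.org/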
class Spider
  # start the spider
  def start
    Hpricot.buffer_size = 204800
    process(YAML.load_file(LAST_CRAWLED_PAGES))
  end
  # process the loaded pages
  def process(pages)
    robot = Robots.new USER_AGENT
    until pages.nil? or pages.empty?
      newfound_pages = []
      pages.each { |page|
        begin
          if add_to_index?(page) then
            uri = URI.parse(page)
            host = "#{uri.scheme}://#{uri.host}"
            # fetch the page and queue every crawlable, robots-permitted link found on it
            open(page, "User-Agent" => USER_AGENT) { |s|
              (Hpricot(s)/"a").each { |a|
                url = scrub(a.attributes['href'], host)
                newfound_pages << url unless url.nil? or !robot.allowed? url or newfound_pages.include? url
              }
            }
          end
        rescue Timeout::Error => e
          # rescue the more specific error first, or the generic clause below swallows it
          print "\n** Timeout encountered - #{page} - #{e.to_s}"
        rescue => e
          print "\n** Error encountered crawling - #{page} - #{e.to_s}"
        end
      }
      pages = newfound_pages
      # persist the next crawl frontier so an interrupted crawl can resume from the seed file
      File.open(LAST_CRAWLED_PAGES, 'w') { |out| YAML.dump(newfound_pages, out) }
    end
  end
  # add the page to the index; returns true if the page was (re)indexed
  def add_to_index?(url)
    print "\n- indexing #{url}"
    t0 = Time.now
    page = Page.find(scrub(url))
    # if the page is not in the index, then index it
    if page.new_record? then
      index(url) { |doc_words, title|
        dsize = doc_words.size.to_f
        puts " [new] - (#{dsize.to_i} words)"
        doc_words.each_with_index { |w, l|
          printf("\r - %6.2f%%", (l * 100 / dsize))   # progress indicator
          loc = Location.new(:position => l)
          loc.word, loc.page, page.title = Word.find(w), page, title
          loc.save
        }
      }
    # if it is indexed but no longer fresh, then reindex it
    elsif not page.fresh? then
      index(url) { |doc_words, title|
        dsize = doc_words.size.to_f
        puts " [refreshed] - (#{dsize.to_i} words)"
        page.locations.destroy!
        doc_words.each_with_index { |w, l|
          printf("\r - %6.2f%%", (l * 100 / dsize))   # progress indicator
          loc = Location.new(:position => l)
          loc.word, loc.page, page.title = Word.find(w), page, title
          loc.save
        }
      }
      page.refresh
    # otherwise just ignore it
    else
      puts " - (x) already indexed"
      return false
    end
    t1 = Time.now
    puts " [%6.2f sec]" % (t1 - t0)
    return true
  end
  # scrub the given link into an absolute, normalized URL; returns nil if it should not be crawled
  def scrub(link, host=nil)
    unless link.nil? then
      return nil if DO_NOT_CRAWL_TYPES.include?(File.extname(link)) or link.include? '?' or
                    link.include? '/cgi-bin/' or link.include? '&' or
                    link[0..9] == 'javascript' or link[0..5] == 'mailto'
      # strip any fragment identifier
      link = link.index('#') == 0 ? '' : link[0..link.index('#')-1] if link.include? '#'
      if link[0..3] == 'http'
        url = URI.join(URI.escape(link))
      else
        url = URI.join(host, URI.escape(link))
      end
      return url.normalize.to_s
    end
  end
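  # Illustrative examples (hypothetical URLs) of how scrub behaves, given the rules above:
  #   scrub('/about#team', 'http://www.example.com')  #=> "http://www.example.com/about"
  #   scrub('mailto:someone@example.com')             #=> nil (mailto link)
  #   scrub('report.pdf', 'http://www.example.com')   #=> nil (blocked file type)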
  # do the common indexing work
  def index(url)
    open(url, "User-Agent" => USER_AGENT) { |doc|
      h = Hpricot(doc)
      title, body = h.search('title').text.strip, h.search('body')
      %w(style noscript script form img).each { |tag| body.search(tag).remove }
      array = []
      body.first.traverse_element { |element| array << element.to_s.strip.gsub(/[^a-zA-Z ]/, '') if element.text? }
      array.delete("")
      yield(array.join(" ").words, title)
    }
  end
end
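# Run the script directly (e.g. `ruby spider.rb`) after seeding the YAML file above;
# $stdout.sync keeps the inline progress output unbuffered.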
$stdout.sync = true
spider = Spider.new
spider.start