-
Notifications
You must be signed in to change notification settings - Fork 0
/
minion.go
86 lines (72 loc) · 2 KB
/
minion.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
package cybergrab
import (
"fmt"
"golang.org/x/net/html"
"net/http"
"strings"
"time"
)
// pageMinion scans a single page for links and media sources and hands
// them to the crawl policy, link dispenser and downloader that it reaches
// through its scheduler.
type pageMinion struct {
// scheduler is the only dependency of a minion; run() queries it for the
// crawl policy, the link dispenser and the downloader.
scheduler scheduler
}
// invalidUrl reports whether url cannot be processed. At present the only
// rejected value is the empty string; no further validation is performed.
func invalidUrl(url string) bool {
	if len(url) == 0 {
		return true
	}
	return false
}
// run instructs this minion to process the given url: the page is fetched
// and its markup is scanned for <a>, <img>, <video> and <audio> tags.
//
// For each href/src attribute found, this minion can do any combination of:
//  1. Push the url into the download queue (src attributes)
//  2. Push the url into the link dispenser queue, for other minions to run()
//     (href attributes)
//  3. Do nothing
//
// Which of these happens is decided by the scheduler's crawl policy.
// Relative references are resolved against the URL the page was actually
// served from, i.e. the final request after any redirects.
//
// Returns an error if url is invalid or the page cannot be fetched, or nil
// for success.
func (pg pageMinion) run(url string) error {
	fmt.Println("pageMinion: Scrubbing page: " + url)

	if invalidUrl(url) {
		return fmt.Errorf("URL <%s> is invalid, ignoring.", url)
	}

	// set a timeout for http.Get() so a stalled server cannot block this
	// minion forever
	timeout := time.Duration(WORKER_TIMEOUT_SECONDS * time.Second)
	client := http.Client{
		Timeout: timeout,
	}

	resp, err := client.Get(url)
	if err != nil {
		// The error from Client.Get already names the method and URL, so
		// it is reported as-is instead of being silently dropped.
		return err
	}
	defer resp.Body.Close()

	// Base for resolving relative references. resp.Request is the request
	// that produced this response, so redirects are taken into account.
	base := resp.Request.URL

	z := html.NewTokenizer(resp.Body)

	// iterate through all hrefs/srcs and decide whether to
	// download the target, push it into the queue for
	// other minions to process, or both.
	for tt := z.Next(); tt != html.ErrorToken; tt = z.Next() {
		// Tags written as <img src="..."/> are reported as self-closing
		// tokens rather than start tags; both forms carry attributes.
		if tt != html.StartTagToken && tt != html.SelfClosingTagToken {
			continue
		}

		t := z.Token()

		// skip tags that are neither links to follow nor downloadable content
		switch t.Data {
		case "a", "img", "video", "audio":
		default:
			continue
		}

		for _, attr := range t.Attr {
			if attr.Key != "href" && attr.Key != "src" {
				continue
			}

			link := strings.TrimSpace(attr.Val)

			// deal with relative links: anything that is not already an
			// absolute http(s) URL is resolved against the page's own URL
			if !strings.HasPrefix(link, "http://") &&
				!strings.HasPrefix(link, "https://") {
				resolved, perr := base.Parse(link)
				if perr != nil {
					// malformed reference in the page; nothing sensible
					// to queue, move on to the next attribute
					continue
				}
				link = resolved.String()
			}

			switch attr.Key {
			case "href":
				if pg.scheduler.getCrawlPolicy().ShouldCrawl(link) {
					pg.scheduler.getLinkDispenser().pushUrl(link)
				}
			case "src":
				if pg.scheduler.getCrawlPolicy().ShouldDownload(link) {
					pg.scheduler.getDownloader().addDownload(link)
				}
			}
		}
	}

	return nil
}