-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrap_html.go
129 lines (110 loc) · 2.35 KB
/
scrap_html.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
// Functions related to scraping urls from HTTP pages.
package main
import (
"io"
"net/url"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
func urlCmp(u1, u2 *url.URL) bool {
return u1.Host == u2.Host && u1.Path == u2.Path
}
// findNodeAtomInNode returns the first direct child of n whose DataAtom
// equals tag, or nil when n has no such child. Only immediate children
// are inspected; the search does not descend into grandchildren.
func findNodeAtomInNode(n *html.Node, tag atom.Atom) *html.Node {
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		if child.DataAtom == tag {
			return child
		}
	}
	return nil
}
// findAllAtomTagInNode collects every node in the subtree rooted at n
// (n itself included) whose DataAtom equals tag. Nodes are returned in
// document (preorder) order. A nil n yields an empty slice.
func findAllAtomTagInNode(n *html.Node, tag atom.Atom) []*html.Node {
	matches := []*html.Node{}
	if n == nil {
		return matches
	}
	pending := []*html.Node{n}
	for len(pending) > 0 {
		cur := pending[len(pending)-1]
		pending = pending[:len(pending)-1]
		if cur.DataAtom == tag {
			matches = append(matches, cur)
		}
		// Push children right-to-left so the leftmost child is popped
		// first, preserving the document's original ordering.
		for c := cur.LastChild; c != nil; c = c.PrevSibling {
			pending = append(pending, c)
		}
	}
	return matches
}
// findAtomAttrInNode scans n's attribute list for one whose key interns
// to the atom needle. The boolean result reports whether a match was
// found; on a miss the returned attribute is the zero value.
func findAtomAttrInNode(n *html.Node, needle atom.Atom) (html.Attribute, bool) {
	for _, attr := range n.Attr {
		if atom.Lookup([]byte(attr.Key)) != needle {
			continue
		}
		return attr, true
	}
	var none html.Attribute
	return none, false
}
// findHref parses n's href attribute into a URL. A node without an
// href attribute yields (nil, nil); a malformed href yields the parse
// error from url.Parse.
func findHref(n *html.Node) (*url.URL, error) {
	attr, ok := findAtomAttrInNode(n, atom.Href)
	if !ok {
		return nil, nil
	}
	return url.Parse(attr.Val)
}
func findBaseHrefInNode(n *html.Node) (*url.URL, error) {
head := findNodeAtomInNode(n, atom.Head)
if head == nil {
return nil, nil
}
base := findNodeAtomInNode(head, atom.Base)
if base == nil {
return nil, nil
}
return findHref(base)
}
// ScrapHtml parses the HTML document read from body and returns the
// unique URLs found in its <a> elements, in document order. Relative
// links are resolved per RFC 3986 against the document's <base href>
// when one is present, then against host. Two URLs are considered
// duplicates when their host and path match (same semantics as urlCmp).
// Any parse failure or unusable document yields the (possibly empty)
// list gathered so far.
func ScrapHtml(host *url.URL, body io.Reader) []*url.URL {
	urls := []*url.URL{}
	doc, err := html.Parse(body)
	if err != nil {
		return urls
	}
	root := findNodeAtomInNode(doc, atom.Html)
	if root == nil {
		return urls
	}
	baseHref, err := findBaseHrefInNode(root)
	if err != nil {
		return urls
	}
	// Track host+path pairs already emitted. '\x00' cannot appear in
	// either component, so the key is unambiguous; the map replaces the
	// original O(n^2) linear-scan deduplication.
	seen := make(map[string]bool)
	for _, n := range findAllAtomTagInNode(root, atom.A) {
		u, err := findHref(n)
		if err != nil || u == nil {
			continue
		}
		// ResolveReference performs proper RFC 3986 resolution: unlike
		// the previous JoinPath(u.String()) it keeps the reference's own
		// query/fragment intact instead of escaping them into the path,
		// and unlike copying *host it does not inherit the base URL's
		// query or fragment.
		if !u.IsAbs() && baseHref != nil {
			u = baseHref.ResolveReference(u)
		}
		if !u.IsAbs() {
			u = host.ResolveReference(u)
		}
		key := u.Host + "\x00" + u.Path
		if seen[key] {
			continue
		}
		seen[key] = true
		urls = append(urls, u)
	}
	// html.Parse consumes body to EOF, so the original trailing
	// io.ReadAll(body) (whose error was discarded) was redundant.
	return urls
}