// Functions related to requesting resources from HTTP servers and scraping the result.
package main

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"net/netip"
	"net/url"
	"slices"
	"strconv"
	"strings"
	"time"
)

// host identifies a remote server both by its resolved address and port and
// by the original host:port string from the URL.
type host struct {
	ip       netip.AddrPort
	hostPort string
}

// roundtrip records one request/response cycle against a host: the request
// timing, any error, and whatever was scraped from the reply (robots.txt
// rules, discovered URLs, and the crawl delay to honor).
type roundtrip struct {
	connId      uint
	client      *http.Client
	host        *host
	url         *url.URL
	err         error
	requestTs   time.Time
	replyTs     time.Time
	robots      robotstxt
	scrapedUrls []*url.URL
	crawlDelay  time.Duration
}

// sliceContainsUrl reports whether needle occurs in urls, comparing the
// parsed URL structs by value.
func sliceContainsUrl(urls []*url.URL, needle *url.URL) bool {
	return slices.ContainsFunc(urls, func(u *url.URL) bool {
		return *u == *needle
	})
}

// getUrl issues a GET request for target using the supplied client, tied to
// ctx for cancellation.
func getUrl(ctx context.Context, client *http.Client, target *url.URL) (*http.Response, error) {
	targetUrl := target.String()
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, targetUrl, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to make request: %w", err)
	}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to get url %v: %w", targetUrl, err)
	}
	return resp, nil
}

// isResponseRobotstxt reports whether resp looks like a robots.txt file:
// a plain-text body served from a URL ending in "robots.txt".
func isResponseRobotstxt(resp *http.Response) bool {
	ctype := resp.Header.Get("Content-Type")
	isPlainText := strings.HasPrefix(ctype, "text/plain")
	u := resp.Request.URL.String()
	return strings.HasSuffix(u, "robots.txt") && isPlainText
}

// isResponseHtml reports whether resp carries an HTML body, falling back to
// the URL's extension when no Content-Type header is present.
func isResponseHtml(resp *http.Response) bool {
	ctype := resp.Header.Get("Content-Type")
	if ctype == "" {
		return strings.HasSuffix(resp.Request.URL.String(), ".html")
	}
	return strings.HasPrefix(ctype, "text/html")
}

// parseHttp429Headers extracts the wait time from a Retry-After header,
// which may hold either a delta in seconds or an HTTP-date. It returns
// false if the header is absent or unparsable.
func parseHttp429Headers(headers http.Header) (time.Duration, bool) {
	retryFields, found := headers["Retry-After"]
	if !found || len(retryFields) == 0 {
		return 0, false
	}
	retryS := retryFields[len(retryFields)-1]
	if retry, err := strconv.Atoi(retryS); err == nil {
		return time.Duration(retry) * time.Second, true
	}
	// HTTP-dates use the IMF-fixdate layout, e.g. "Mon, 02 Jan 2006 15:04:05 GMT"
	if nextRetry, err := time.Parse(http.TimeFormat, retryS); err == nil {
		return time.Until(nextRetry), true
	}
	return 0, false
}

// scrapConnection performs one request for r.url and fills in the roundtrip:
// timing, rate-limit hints, robots.txt rules, and any URLs scraped from an
// HTML body.
func scrapConnection(ctx context.Context, r *roundtrip) *roundtrip {
	var resp *http.Response
	r.requestTs = time.Now()
	resp, r.err = getUrl(ctx, r.client, r.url)
	r.replyTs = time.Now()
	if r.err != nil {
		return r
	}
	defer resp.Body.Close()
	urls := []*url.URL{}
	// Server requested rate-limiting: honor Retry-After if present,
	// otherwise back off by an extra second
	if resp.StatusCode == http.StatusTooManyRequests {
		if retry, ok := parseHttp429Headers(resp.Header); ok {
			r.crawlDelay = max(retry, r.crawlDelay)
		} else {
			r.crawlDelay += time.Second
		}
	}
	// Add the url from a redirect Location header, if the response carries one
	location, err := resp.Location()
	if err == nil && !sliceContainsUrl(urls, location) {
		urls = append(urls, location)
	}
	if isResponseRobotstxt(resp) {
		r.robots = scrapRobotsTxt(resp.Body)
		if d, found := r.robots.crawlDelay(); found {
			r.crawlDelay = d
		}
	} else if isResponseHtml(resp) {
		sUrls := ScrapHtml(r.url, resp.Body)
		urls = append(sUrls, urls...)
	} else {
		// Persistent connections need the body drained before reuse
		io.Copy(io.Discard, resp.Body)
	}
	r.scrapedUrls = urls
	return r
}