forked from dream11/crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.js
128 lines (95 loc) · 2.69 KB
/
crawler.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
/**
* Created by tushar on 13/09/17.
*/
'use strict'
// Module-level crawl state. Every function in this file reads and mutates
// these bindings directly, and the exported function resets most of them at
// the start of each crawl.

// HTTP client; called as request(url, callback(err, res, body)).
var request = require('request');
// Index into linksArr that loopCrawler reads to pick a link to fetch.
var count = 0;
// Lookup of link paths already added to linksArr (key = path, value = truthy marker).
var linkStrings = {};
// Reset per crawl, but not otherwise read by the functions in this file.
var bstrings = [];
// Link paths in discovery order; loopCrawler also re-appends a path whose request failed.
var linksArr = [];
// Number of link requests issued by loopCrawler.
var requestCount = 0;
// Number of link responses handled by loopCrawler.
var responseCount = 0;
// Reset per crawl, but not otherwise read by the functions in this file.
var concurrentRequests = 0;
// Start URL of the current crawl; link paths are appended to it as mainUrl + '/' + path.
var mainUrl = "";
// Smallest <h1> string seen so far; "zzzzzzzz" is the starting sentinel.
var minString = "zzzzzzzz";
// Only referenced by a commented-out sort call in loopCrawler.
var TimSort = require('timsort');
// Set to 1 once the result has been handed to the callback; later responses are ignored.
var done = 0;
// Not referenced by the functions in this file, and not reset between crawls.
var requestsArr = [];
/*var responseTimeoutInstancee = null;
var responseTimeoutDuration = 3000;*/
/**
 * Crawls a website starting at {url} and resolves with the lexicographically
 * smallest string collected during the crawl.
 *
 * The crawl state lives in module-level variables that are reset here, so
 * calling this again while a crawl is still running resets that crawl's state.
 *
 * @param {string} url - Start URL; discovered link paths are fetched relative to it.
 * @return {Promise.<string>} Resolves with the string handed to the crawl
 *   callback. Rejects when the callback receives an error as its second
 *   argument, or when starting the crawl throws synchronously.
 */
module.exports = url =>
  new Promise((resolve, reject) => {
    // Start from a clean slate: drop everything a previous crawl left behind.
    count = 0;
    linkStrings = {};
    bstrings = [];
    linksArr = [];
    requestCount = 0;
    responseCount = 0;
    concurrentRequests = 0;
    mainUrl = "";
    minString = "zzzzzzzz";
    done = 0;
    // The second callback argument is optional: it stays undefined when the
    // crawl only reports a result, and carries the failure otherwise.
    crawl(url, (str, err) => {
      if (err) {
        reject(err);
        return;
      }
      resolve(str);
    });
  });
/**
 * Fetches the start page, collects its links and strings, and starts the link
 * crawl from the first queued link.
 *
 * @param {string} url - Start URL; stored in mainUrl as the base for link paths.
 * @param {function(string, Error=)} cb - Called with the smallest string found.
 *   When the start page cannot be fetched, it is called right away with the
 *   current minString and the request error as a second argument.
 */
function crawl(url, cb) {
  mainUrl = url;
  request(url, (err, res, body) => {
    if (err) {
      // Without a response there is nothing to parse; report the failure
      // instead of dereferencing an undefined response object.
      console.log(err);
      cb(minString, err);
      return;
    }
    parseBody(res.statusCode, body);
    const firstLink = linksArr[count];
    if (firstLink === undefined) {
      // The start page queued no links, so the crawl is already complete.
      cb(minString);
      return;
    }
    loopCrawler(cb, firstLink);
  });
}
// Upper bound on how often a single link path is fetched before it is abandoned.
const MAX_FETCH_ATTEMPTS = 3;

/**
 * Fetches one queued link and, from its response callback, moves on to the
 * next entry of linksArr. Each call issues at most one request, so every
 * queued link is fetched once, in queue order.
 *
 * The crawl finishes when the queue has no entry left at index `count`; the
 * callback then receives the smallest string collected so far.
 *
 * A failed request is logged and the link is re-queued at the end of linksArr,
 * up to MAX_FETCH_ATTEMPTS fetches per link. The attempt number is kept in
 * linkStrings[url], which stays truthy and so keeps marking the link as known.
 *
 * @param {function(string)} cb - Called exactly once, with minString, when the crawl ends.
 * @param {string|undefined} url - Link path relative to mainUrl, or undefined
 *   when the queue is exhausted.
 * @return {undefined} Nothing; callers in this file ignore the return value.
 */
const loopCrawler = (cb, url) => {
  if (done) {
    return;
  }
  if (url === undefined) {
    // Nothing left to fetch and no request in flight: report the result.
    done = 1;
    cb(minString);
    return;
  }
  requestCount++;
  request(`${mainUrl}/${url}`, (err, res, body) => {
    if (done) {
      return;
    }
    responseCount++;
    if (err) {
      console.log(err);
      const attempts = typeof linkStrings[url] === 'number' ? linkStrings[url] : 1;
      if (attempts < MAX_FETCH_ATTEMPTS) {
        // Retry later by queueing the link again behind everything else.
        linkStrings[url] = attempts + 1;
        linksArr.push(url);
      }
    } else {
      parseBody(res.statusCode, body, url);
    }
    // Advance first, then fetch: this keeps each queue entry to a single
    // request and lets the last entry be reached.
    count++;
    loopCrawler(cb, linksArr[count]);
  });
};
/**
 * Scans a page body for crawlable links and candidate strings.
 *
 * Links are anchors matching `link" href="/PATH"...>TEXT<`, where TEXT is four
 * characters from [a-z0-9]. Each new PATH (cut to 32 characters) is recorded
 * in linkStrings and appended to linksArr. The text of every `<h1>` (cut to 6
 * characters) is compared against minString, which keeps the smallest one.
 *
 * @param {number} status - HTTP status of the response (currently unused).
 * @param {*} body - Response body; anything that is not a string is ignored.
 * @param {string} [popUrl] - Link path the body was fetched from (currently unused).
 */
function parseBody(status, body, popUrl) {
  if (typeof body !== 'string') {
    return;
  }
  const MAX_LINK_LENGTH = 32;
  const MAX_TEXT_LENGTH = 6;
  // Fresh literals per call, so no lastIndex state leaks between pages. The
  // captures stop at the closing quote / closing tag instead of running
  // greedily to the end of the line.
  const linkPattern = /link" href="\/([^"]*)"[^>]*>[a-z0-9]{4}</g;
  const headingPattern = /<h1>(.*?)<\/h1>/g;

  let match;
  while ((match = linkPattern.exec(body)) !== null) {
    const link = match[1].slice(0, MAX_LINK_LENGTH);
    // An empty path points back at the start page, which is already crawled.
    // hasOwnProperty keeps inherited keys such as "constructor" from looking
    // like links that were already seen.
    if (link !== '' && !Object.prototype.hasOwnProperty.call(linkStrings, link)) {
      linkStrings[link] = 1;
      linksArr.push(link);
    }
  }

  while ((match = headingPattern.exec(body)) !== null) {
    const text = match[1].slice(0, MAX_TEXT_LENGTH);
    if (text < minString) {
      minString = text;
    }
  }
}