forked from floneum/floneum
-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawl.rs
60 lines (53 loc) · 2.1 KB
/
crawl.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
use kalosm::language::*;
use std::future::Future;
use std::io::Write;
use std::pin::Pin;
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
use std::sync::Arc;
/// Returns how much shorter `simplified_len` is than `original_len`, expressed as a
/// percentage of `original_len`.
///
/// The result is negative when the simplified document is longer than the original, and
/// `0.0` when the original document is empty (which would otherwise yield `0 / 0 = NaN`).
fn percentage_decrease(original_len: usize, simplified_len: usize) -> f32 {
    if original_len == 0 {
        return 0.0;
    }
    // Subtract in floating point: a `usize` subtraction would overflow (panic in debug,
    // wrap in release) whenever the simplified document is longer than the original.
    let original = original_len as f64;
    ((original - simplified_len as f64) / original * 100.0) as f32
}

/// Crawls pages starting from `https://dioxuslabs.com/learn/0.5/`, simplifies the HTML of
/// each page on `dioxuslabs.com` whose path starts with `/learn/0.5/`, and writes the
/// simplified HTML to `scraped/<n>.html`, where `<n>` is the value of a shared visit
/// counter at the time the page was handed to the callback.
///
/// Saving pages is best effort: I/O failures are reported on stderr and the crawl goes on.
///
/// # Panics
///
/// Panics if `Page::crawl` returns an error.
#[tokio::main]
async fn main() {
    const DOMAIN: &str = "dioxuslabs.com";
    const PATH_PREFIX: &str = "/learn/0.5/";
    const OUTPUT_DIR: &str = "scraped";

    // Create the output directory once up front instead of once per crawled page.
    if let Err(err) = std::fs::create_dir_all(OUTPUT_DIR) {
        eprintln!("failed to create output directory {:?}: {}", OUTPUT_DIR, err);
    }

    // Counts every page passed to the callback, including the ones that are filtered out
    // below, so the numbers used in file names may have gaps.
    let real_visited = Arc::new(AtomicUsize::new(0));
    Page::crawl(
        Url::parse("https://dioxuslabs.com/learn/0.5/").expect("hard-coded start URL is valid"),
        BrowserMode::Static,
        move |page: Page| {
            let real_visited = real_visited.clone();
            Box::pin(async move {
                let visited = real_visited.fetch_add(1, Ordering::SeqCst);

                // Only pages of the 0.5 guide on the Dioxus site are processed.
                if page.url().domain() != Some(DOMAIN) {
                    return CrawlFeedback::follow_none();
                }
                if !page.url().path().starts_with(PATH_PREFIX) {
                    return CrawlFeedback::follow_none();
                }

                let Ok(mut document) = page.html().await else {
                    return CrawlFeedback::follow_none();
                };

                // Measure the HTML before and after simplification to report the savings.
                let original_length = document.html().len();
                let mut simplifier = HtmlSimplifier::default();
                simplifier.simplify(&mut document);
                let simplified = document.html();
                let simplified_length = simplified.len();
                println!(
                    "simplifing {} -{:.3}% from {:?} to {:?}",
                    page.url(),
                    percentage_decrease(original_length, simplified_length),
                    original_length,
                    simplified_length
                );

                // Write the page to disk; a failure is reported but does not stop the crawl.
                let path = std::path::Path::new(OUTPUT_DIR).join(format!("{visited}.html"));
                match std::fs::File::create(&path) {
                    Ok(mut file) => {
                        if let Err(err) = file.write_all(simplified.as_bytes()) {
                            eprintln!("failed to write {}: {}", path.display(), err);
                        }
                    }
                    Err(err) => eprintln!("failed to create {}: {}", path.display(), err),
                }

                CrawlFeedback::follow_all()
            }) as Pin<Box<dyn Future<Output = CrawlFeedback>>>
        },
    )
    .await
    .unwrap();
}