feat(page): add with_return_page_links configuration [#8]
j-mendez committed Aug 27, 2024
1 parent a6439cc commit 128b66f
Showing 6 changed files with 59 additions and 19 deletions.
14 changes: 7 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,7 +1,7 @@
[package]
edition = "2021"
name = "spider_rs"
version = "0.0.48"
version = "0.0.49"
repository = "https://github.com/spider-rs/spider-py"
license = "MIT"
description = "The fastest web crawler and indexer."
14 changes: 14 additions & 0 deletions book/src/website.md
@@ -20,6 +20,20 @@ async def main():
asyncio.run(main())
```

### Return Page Links

Return links found on the page resource.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_return_page_links(True)

asyncio.run(main())
```
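
The snippet above only sets the option; the links themselves arrive on the page objects delivered during a crawl. A minimal sketch of collecting them, assuming the callback form of `crawl(handler)` and the `links` attribute added to the page object in this change (exact call signatures may differ between versions):

```py
import asyncio
from spider_rs import Website

class LinkCollector:
    """Accumulates the links reported for every subscribed page."""
    def __init__(self):
        self.links = set()

    def __call__(self, page):
        # page.links is None unless with_return_page_links(True) was configured.
        if page.links:
            self.links.update(page.links)

async def main():
    collector = LinkCollector()
    website = Website("https://choosealicense.com").with_return_page_links(True)
    website.crawl(collector)
    print(len(collector.links), "links found")

asyncio.run(main())
```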

### Custom Headers

Add custom HTTP headers to use when crawling/scraping.
29 changes: 19 additions & 10 deletions src/npage.rs
@@ -1,33 +1,33 @@
-use std::collections::HashMap;

-use crate::page::header_map_to_hash_map;
use pyo3::prelude::*;

use spider::{
  lazy_static::lazy_static,
  packages::scraper::{Html, Selector},
};

+use crate::page::header_map_to_hash_map;
+use std::collections::{HashMap, HashSet};

/// a simple page object
#[derive(Default, Clone)]
#[pyclass]
pub struct NPage {
#[pyo3(get)]
-/// the url found.
+/// The url of the resource.
pub url: String,
#[pyo3(get)]
-/// the content of the page found.
+/// The content of the page found as UTF-8.
pub content: String,
#[pyo3(get)]
-/// the HTTP status code.
+/// The HTTP status code.
pub status_code: u16,
#[pyo3(get)]
-/// the raw content
+/// The raw content in bytes.
pub raw_content: Option<Vec<u8>>,
#[pyo3(get)]
-/// the headers
+/// The HTTP headers.
pub headers: Option<HashMap<String, String>>,
+#[pyo3(get)]
+/// The links found on the page. Requires the website builder method with_return_page_links to be set to true.
+pub links: Option<HashSet<String>>,
}

/// get the page title.
@@ -54,6 +54,15 @@ pub fn new_page(res: &spider::page::Page, raw: bool) -> NPage {
Some(ref headers) => Some(header_map_to_hash_map(headers)),
_ => None,
},
links: match res.page_links {
Some(ref links) => Some(
links
.iter()
.map(|link| link.as_ref().to_string())
.collect::<HashSet<String>>(),
),
_ => None,
},
}
}
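
Because every field on `NPage` is exposed with `#[pyo3(get)]`, a Python handler can read them as plain attributes. A small illustrative sketch (attribute names taken from the struct above; the handler wiring is the same assumption as in the book example earlier):

```py
def on_page(page):
    # Mirrors the NPage fields: url, content, status_code, raw_content, headers, links.
    print(page.url, page.status_code)
    if page.headers:
        print("content-type:", page.headers.get("content-type"))
    # links stays None unless with_return_page_links(True) is set on the Website builder.
    for link in page.links or set():
        print("  ->", link)
```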

8 changes: 7 additions & 1 deletion src/page.rs
@@ -1,5 +1,5 @@
use pyo3::{pyclass, pymethods, PyRef, PyRefMut};
use spider::{compact_str::CompactString, reqwest::header::HeaderMap};
use spider::{compact_str::CompactString, hashbrown::HashSet, reqwest::header::HeaderMap};
use std::collections::HashMap;

/// a simple page object
@@ -15,10 +15,16 @@ pub struct Page {
)>,
/// the url for the page
pub url: String,
/// subdomains being crawled?
pub subdomains: Option<bool>,
/// tld being crawled?
pub tld: Option<bool>,
/// The HTTP status code.
pub status_code: u16,
/// The HTTP headers.
pub headers: Option<HashMap<String, String>>,
/// The links found on the page. Requires the website builder method with_return_page_links to be set to true.
pub links: Option<HashSet<String>>,
}

/// convert a headermap to hashmap
11 changes: 11 additions & 0 deletions src/website.rs
@@ -718,6 +718,17 @@ impl Website {
slf
}

/// Return the links found on the page in the channel subscriptions. This method does nothing if the `decentralized` feature is enabled.
pub fn with_return_page_links(
mut slf: PyRefMut<'_, Self>,
return_page_links: bool,
) -> PyRefMut<'_, Self> {
slf
.inner
.with_return_page_links(return_page_links);
slf
}
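
Like the other configuration methods on `Website`, this one hands the instance back, so it chains with the rest of the builder. A hedged sketch of combining it with the custom-headers option documented in the book (the `with_headers` signature is assumed here):

```py
from spider_rs import Website

# Builder calls return the same Website, so configuration chains.
# Note: with_return_page_links has no effect when decentralized mode is enabled.
website = (
    Website("https://choosealicense.com")
    .with_headers({"user-agent": "example-bot"})
    .with_return_page_links(True)
)
```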

/// Wait for a delay. Should only be used for testing. This method does nothing if the `chrome` feature is not enabled.
pub fn with_wait_for_delay(
mut slf: PyRefMut<'_, Self>,
