feat(page): add with_return_page_links configuration [#8]
j-mendez committed Aug 27, 2024
1 parent a6439cc commit 128b66f
Showing 6 changed files with 59 additions and 19 deletions.
14 changes: 7 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,7 +1,7 @@
[package]
edition = "2021"
name = "spider_rs"
version = "0.0.48"
version = "0.0.49"
repository = "https://github.com/spider-rs/spider-py"
license = "MIT"
description = "The fastest web crawler and indexer."
14 changes: 14 additions & 0 deletions book/src/website.md
@@ -20,6 +20,20 @@ async def main():
asyncio.run(main())
```

### Return Page Links

Return links found on the page resource.

```py
import asyncio
from spider_rs import Website

async def main():
    website = Website("https://choosealicense.com").with_return_page_links(True)

asyncio.run(main())
```
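
The snippet above only sets the option; the links themselves arrive on the page objects delivered during a crawl. A minimal sketch of collecting them, assuming the callback form of `crawl(handler)` and the `links` attribute added to the page object in this change (exact call signatures may differ between versions):

```py
import asyncio
from spider_rs import Website

class LinkCollector:
    """Accumulates the links reported for every subscribed page."""
    def __init__(self):
        self.links = set()

    def __call__(self, page):
        # page.links is None unless with_return_page_links(True) was configured.
        if page.links:
            self.links.update(page.links)

async def main():
    collector = LinkCollector()
    website = Website("https://choosealicense.com").with_return_page_links(True)
    website.crawl(collector)
    print(len(collector.links), "links found")

asyncio.run(main())
```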

### Custom Headers

Add custom HTTP headers to use when crawling/scraping.
29 changes: 19 additions & 10 deletions src/npage.rs
@@ -1,33 +1,33 @@
-use std::collections::HashMap;

-use crate::page::header_map_to_hash_map;
use pyo3::prelude::*;

use spider::{
  lazy_static::lazy_static,
  packages::scraper::{Html, Selector},
};

+use crate::page::header_map_to_hash_map;
+use std::collections::{HashMap, HashSet};

/// a simple page object
#[derive(Default, Clone)]
#[pyclass]
pub struct NPage {
#[pyo3(get)]
-/// the url found.
+/// The url of the resource.
pub url: String,
#[pyo3(get)]
-/// the content of the page found.
+/// The content of the page found as UTF-8.
pub content: String,
#[pyo3(get)]
-/// the HTTP status code.
+/// The HTTP status code.
pub status_code: u16,
#[pyo3(get)]
-/// the raw content
+/// The raw content in bytes.
pub raw_content: Option<Vec<u8>>,
#[pyo3(get)]
-/// the headers
+/// The HTTP headers.
pub headers: Option<HashMap<String, String>>,
+#[pyo3(get)]
+/// The links found on the page. Requires the website builder method with_return_page_links to be set to true.
+pub links: Option<HashSet<String>>,
}

/// get the page title.
@@ -54,6 +54,15 @@ pub fn new_page(res: &spider::page::Page, raw: bool) -> NPage {
Some(ref headers) => Some(header_map_to_hash_map(headers)),
_ => None,
},
links: match res.page_links {
Some(ref links) => Some(
links
.iter()
.map(|link| link.as_ref().to_string())
.collect::<HashSet<String>>(),
),
_ => None,
},
}
}
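
Because every field on `NPage` is exposed with `#[pyo3(get)]`, a Python handler can read them as plain attributes. A small illustrative sketch (attribute names taken from the struct above; the handler wiring is the same assumption as in the book example earlier):

```py
def on_page(page):
    # Mirrors the NPage fields: url, content, status_code, raw_content, headers, links.
    print(page.url, page.status_code)
    if page.headers:
        print("content-type:", page.headers.get("content-type"))
    # links stays None unless with_return_page_links(True) is set on the Website builder.
    for link in page.links or set():
        print("  ->", link)
```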

8 changes: 7 additions & 1 deletion src/page.rs
@@ -1,5 +1,5 @@
use pyo3::{pyclass, pymethods, PyRef, PyRefMut};
use spider::{compact_str::CompactString, reqwest::header::HeaderMap};
use spider::{compact_str::CompactString, hashbrown::HashSet, reqwest::header::HeaderMap};
use std::collections::HashMap;

/// a simple page object
@@ -15,10 +15,16 @@ pub struct Page {
)>,
/// the url for the page
pub url: String,
/// subdomains being crawled?
pub subdomains: Option<bool>,
/// tld being crawled?
pub tld: Option<bool>,
/// The HTTP status code.
pub status_code: u16,
/// The HTTP headers.
pub headers: Option<HashMap<String, String>>,
/// The links found on the page. Requires the website builder method with_return_page_links to be set to true.
pub links: Option<HashSet<String>>,
}

/// convert a headermap to hashmap
11 changes: 11 additions & 0 deletions src/website.rs
@@ -718,6 +718,17 @@ impl Website {
slf
}

/// Return the links found on the page in the channel subscriptions. This method does nothing if the `decentralized` feature is enabled.
pub fn with_return_page_links(
mut slf: PyRefMut<'_, Self>,
return_page_links: bool,
) -> PyRefMut<'_, Self> {
slf
.inner
.with_return_page_links(return_page_links);
slf
}
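
Like the other configuration methods on `Website`, this one hands the instance back, so it chains with the rest of the builder. A hedged sketch of combining it with the custom-headers option documented in the book (the `with_headers` signature is assumed here):

```py
from spider_rs import Website

# Builder calls return the same Website, so configuration chains.
# Note: with_return_page_links has no effect when decentralized mode is enabled.
website = (
    Website("https://choosealicense.com")
    .with_headers({"user-agent": "example-bot"})
    .with_return_page_links(True)
)
```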

/// Wait for a delay. Should only be used for testing. This method does nothing if the `chrome` feature is not enabled.
pub fn with_wait_for_delay(
mut slf: PyRefMut<'_, Self>,
