chore(book): add page docs

spider-rs · Dec 27, 2023 · cd57ee5 · cd57ee5
1 parent b93f9c0
commit cd57ee5
Show file tree

Hide file tree

Showing 4 changed files with 97 additions and 0 deletions.
diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md
@@ -10,5 +10,9 @@
 # Configuration
 
 - [Website](./website.md)
+- [Page](./page.md)
 - [Environment](./env.md)
 
+# Usage
+
+- [Cron Job](./cron-job.md)
diff --git a/book/src/cron-job.md b/book/src/cron-job.md
@@ -0,0 +1,20 @@
+# Cron Jobs
+
+Use a cron job that can run any time of day to gather website data.
+
+```python
+import asyncio
+from spider_rs import Website
+
+class Subscription:
+    def __init__(self): 
+        print("Cron Created...") 
+    def __call__(self, page): 
+        print(page.url + " - status: " + str(page.status_code))
+
+async def main():
+    website = Website("https://choosealicense.com").with_cron("1/5 * * * * *").build()
+    handle = await website.run_cron(Subscription);
+
+asyncio.run(main())
+```
diff --git a/book/src/page.md b/book/src/page.md
@@ -0,0 +1,72 @@
+# Page
+
+A single page on a website, useful if you just need the root url.
+
+## New Page
+
+Get a new page with content.
+
+The first param is the url, followed by whether subdomains should be included, and lastly whether to include TLDs in links.
+
+Calling `page.fetch` is needed to get the content.
+
+```python
+import asyncio
+from spider_rs import Page
+
+async def main():
+    page = Page("https://choosealicense.com")
+    page.fetch()
+
+asyncio.run(main())
+```
+
+## Page Links
+
+Get all the links related to a page.
+
+```python
+import asyncio
+from spider_rs import Page
+
+async def main():
+    page = Page("https://choosealicense.com")
+    page.fetch()
+    links = page.get_links()
+    print(links)
+asyncio.run(main())
+```
+
+## Page Html
+
+Get the HTML markup for the page.
+
+```python
+import asyncio
+from spider_rs import Page
+
+async def main():
+    page = Page("https://choosealicense.com")
+    page.fetch()
+    links = page.get_html()
+    print(links)
+
+asyncio.run(main())
+```
+
+## Page Bytes
+
+Get the raw bytes of a page to store the files in a database.
+
+```python
+import asyncio
+from spider_rs import Page
+
+async def main():
+    page = Page("https://choosealicense.com")
+    page.fetch()
+    links = page.get_bytes()
+    print(links)
+
+asyncio.run(main())
+```
diff --git a/book/src/website.md b/book/src/website.md
@@ -272,6 +272,7 @@ class Subscription:
 async def main():
     website = Website("https://choosealicense.com")
     website.crawl(Subscription())
+    # sleep for 2s and stop etc
     website.stop()
 
 asyncio.run(main())