diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md index fea8586..5e57145 100644 --- a/book/src/SUMMARY.md +++ b/book/src/SUMMARY.md @@ -10,5 +10,6 @@ # Configuration - [Website](./website.md) +- [Page](./page.md) - [Environment](./env.md) diff --git a/book/src/page.md b/book/src/page.md new file mode 100644 index 0000000..a561455 --- /dev/null +++ b/book/src/page.md @@ -0,0 +1,72 @@ +# Page + +A single page on a website, useful if you just need the root URL. + +## New Page + +Get a new page with content. + +The first param is the url, followed by whether subdomains should be included, and lastly whether to include TLDs in links. + +Calling `page.fetch` is needed to get the content. + +```python +import asyncio +from spider_rs import Page + +async def main(): + page = Page("https://choosealicense.com") + page.fetch() + +asyncio.run(main()) +``` + +## Page Links + +Get all the links related to a page. + +```python +import asyncio +from spider_rs import Page + +async def main(): + page = Page("https://choosealicense.com") + page.fetch() + links = page.get_links() + print(links) +asyncio.run(main()) +``` + +## Page Html + +Get the HTML markup for the page. + +```python +import asyncio +from spider_rs import Page + +async def main(): + page = Page("https://choosealicense.com") + page.fetch() + links = page.get_html() + print(links) + +asyncio.run(main()) +``` + +## Page Bytes + +Get the raw bytes of a page to store the files in a database. + +```python +import asyncio +from spider_rs import Page + +async def main(): + page = Page("https://choosealicense.com") + page.fetch() + links = page.get_bytes() + print(links) + +asyncio.run(main()) +``` diff --git a/book/src/website.md b/book/src/website.md index 1d5045f..6b12a28 100644 --- a/book/src/website.md +++ b/book/src/website.md @@ -272,6 +272,7 @@ class Subscription: async def main(): website = Website("https://choosealicense.com") website.crawl(Subscription()) + # sleep for 2s and stop etc website.stop() asyncio.run(main())