Skip to content

Commit

Permalink
init of project
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Dec 8, 2023
0 parents commit ac638b2
Show file tree
Hide file tree
Showing 22 changed files with 1,392 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .cargo/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[target.aarch64-unknown-linux-musl]
linker = "aarch64-linux-musl-gcc"
rustflags = ["-C", "target-feature=-crt-static"]
29 changes: 29 additions & 0 deletions .github/workflows/book.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
name: github pages

on:
push:
branches:
- main
pull_request:

jobs:
deploy:
runs-on: ubuntu-20.04
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
steps:
- uses: actions/checkout@v4

- name: Setup mdBook
uses: peaceiris/actions-mdbook@v1
with:
mdbook-version: 'latest'

- run: cd book && mdbook build

- name: Deploy
uses: peaceiris/actions-gh-pages@v3
if: ${{ github.ref == 'refs/heads/main' }}
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_dir: ./book/book
206 changes: 206 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
# Created by https://www.toptal.com/developers/gitignore/api/node
# Edit at https://www.toptal.com/developers/gitignore?templates=node

### Node ###
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# TypeScript v1 declaration files
typings/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env
.env.test

# parcel-bundler cache (https://parceljs.org/)
.cache

# Next.js build output
.next

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port

# Stores VSCode versions used for testing VSCode extensions
.vscode-test

# End of https://www.toptal.com/developers/gitignore/api/node

# Created by https://www.toptal.com/developers/gitignore/api/macos
# Edit at https://www.toptal.com/developers/gitignore?templates=macos

### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two
Icon


# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

### macOS Patch ###
# iCloud generated files
*.icloud

# End of https://www.toptal.com/developers/gitignore/api/macos

# Created by https://www.toptal.com/developers/gitignore/api/windows
# Edit at https://www.toptal.com/developers/gitignore?templates=windows

### Windows ###
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db

# Dump file
*.stackdump

# Folder config file
[Dd]esktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp

# Windows shortcuts
*.lnk

# End of https://www.toptal.com/developers/gitignore/api/windows

#Added by cargo

/target
Cargo.lock

.pnp.*
.yarn/*
!.yarn/patches
!.yarn/plugins
!.yarn/releases
!.yarn/sdks
!.yarn/versions

*.node

# index.d.ts
# index.js
__test__/*.js

/storage
/bench/*.js
/bench/case/**.js
/bench/storage/
30 changes: 30 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
[package]
edition = "2021"
name = "spider_py"
version = "0.0.0"
description = "The fastest web crawler written in Rust ported to nodejs."
repository = "https://github.com/spider-rs/spider-nodejs"

[lib]
crate-type = ["cdylib"]

[dependencies]
compact_str = "0.7.1"
indexmap = "2.1.0"
num_cpus = "1.16.0"
serde = "1.0.193"
serde_json = "1.0.108"
spider = { version = "1.50.20", features = ["budget", "cron", "regex", "cookies", "socks", "chrome", "control" ] }
pyo3 = { version = "0.20.0", features = ["extension-module"] }
pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }

[target.x86_64-unknown-linux-gnu.dependencies]
openssl-sys = { version = "0.9.96", features = ["vendored"] }
openssl = { version = "0.10.60", features = ["vendored"] }

[target.x86_64-unknown-linux-musl.dependencies]
openssl-sys = { version = "0.9.96", features = ["vendored"] }
openssl = { version = "0.10.60", features = ["vendored"] }

[profile.release]
lto = true
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Spider Contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
34 changes: 34 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# spider-py

The [spider](https://github.com/spider-rs/spider) project ported to Python.

## Getting Started

When the package is released run the following:

1. `pip install spider-py`

```python
import asyncio

from spider_py import crawl

async def main():
website = await crawl("https://jeffmendez.com")
print(website.links)

asyncio.run(main())
```
## Development

Install maturin `pipx install maturin` and python.

1. `maturin develop`

## Todo

1. Add thread safe callback handling crawl/scrape.
1. Add callback Cron.
1. Add subscription callback.

Once these items are done the base of the module should be complete. Most of the code comes from the initial port to Node.js that was done.
1 change: 1 addition & 0 deletions book/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
book
10 changes: 10 additions & 0 deletions book/book.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[book]
authors = ["Jeff Mendez"]
language = "en"
multilingual = false
src = "src"
title = "spider-py"

[output.html]
git-repository-url = "https://github.com/spider-rs/spider-py/tree/main/book"
edit-url-template = "https://github.com/spider-rs/spider-py/edit/main/book/{path}"
15 changes: 15 additions & 0 deletions book/src/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Introduction

`Spider-Py` is the fastest web crawler and indexer written in Rust ported to Python.

- Concurrent
- Streaming
- Decentralization
- Headless Chrome [Rendering](https://github.com/mattsse/chromiumoxide)
- HTTP Proxies
- Cron Jobs
- Subscriptions
- Blacklisting and Budgeting Depth
- Written in [Rust](https://www.rust-lang.org/) for speed, safety, and simplicity

Spider powers some big tools and helps bring the crawling aspect to almost no downtime with the correct setup, view the [spider](https://github.com/spider-rs/spider) project to learn more.
13 changes: 13 additions & 0 deletions book/src/SUMMARY.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Summary

[Introduction](./README.md)

# User Guide

- [Getting started](./getting-started.md)
- [A simple example](./simple.md)

# Configuration

- [Environment](./env.md)

11 changes: 11 additions & 0 deletions book/src/env.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Environment

Env variables to adjust the project.

## CHROME_URL

You can set the chrome URL to connect remotely.

```sh
CHROME_URL=http://localhost:9222
```
7 changes: 7 additions & 0 deletions book/src/getting-started.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Getting Started

Make sure to have python installed v10 and higher.

```sh
pip install spider-py
```
Loading

0 comments on commit ac638b2

Please sign in to comment.