diff --git a/README.md b/README.md index 9515f47..e81ab9f 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,10 @@ Leverage the power of Spider in your Python applications. Navigate to our [Pytho Integrate Spider effortlessly into your Javascript projects. Visit our [Javascript client library directory](./javascript/) to explore how you can utilize Spider in Node.js or browser environments. Enhance your web scraping capabilities, improve data collection strategies, and unlock new possibilities with our cutting-edge technology. +## Rust (WIP) + +Integrate Spider effortlessly into your Rust projects. Visit our [Rust client library directory](./rust/) to explore how you can utilize Spider in your applications. Enhance your web scraping capabilities, improve data collection strategies, and unlock new possibilities with our cutting-edge technology. + --- ### Features diff --git a/rust/Cargo.lock b/rust/Cargo.lock new file mode 100644 index 0000000..b3b73aa --- /dev/null +++ b/rust/Cargo.lock @@ -0,0 +1,1241 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "addr2line" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "autocfg" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" + +[[package]] +name = "backtrace" +version = "0.3.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" + +[[package]] +name = "bumpalo" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "bytes" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" + +[[package]] +name = "cc" +version = "1.0.104" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "74b6a57f98764a267ff415d50a25e6e166f3831a5071af4995296ea97d210490" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" + +[[package]] +name = "encoding_rs" +version = "0.8.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "errno" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "fastrand" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foreign-types" +version = "0.3.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "futures-channel" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +dependencies = [ + "futures-core", +] + +[[package]] +name = "futures-core" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" + +[[package]] +name = "futures-io" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" + +[[package]] +name = "futures-macro" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" + +[[package]] +name = "futures-task" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" + +[[package]] +name = "futures-util" 
+version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +dependencies = [ + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "gimli" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" + +[[package]] +name = "h2" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] +name = "hermit-abi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + +[[package]] +name = "http" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + "bytes", + "http", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9" 
+ +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "hyper" +version = "0.14.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f361cde2f109281a220d4307746cdfd5ee3f410da58a70377762396775634b33" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper-tls" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" +dependencies = [ + "bytes", + "hyper", + "native-tls", + "tokio", + "tokio-native-tls", +] + +[[package]] +name = "idna" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "indexmap" +version = "2.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "ipnet" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" + +[[package]] +name = "itoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" + +[[package]] +name = "js-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.155" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" + +[[package]] +name = "linux-raw-sys" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" + +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "miniz_oxide" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" +dependencies = [ + "adler", +] + +[[package]] +name = "mio" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.48.0", +] + +[[package]] +name = "native-tls" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "object" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "081b846d1d56ddfc18fdf1a922e4f6e07a11768ea1b92dec44e42b72712ccfce" +dependencies = [ + "memchr", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "openssl" +version = "0.10.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" +dependencies = [ + "bitflags 2.6.0", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + +[[package]] +name = "openssl-sys" +version = "0.9.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name 
= "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets 0.52.6", +] + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "pin-project-lite" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" + +[[package]] +name = "proc-macro2" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "redox_syscall" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd" +dependencies = [ + "bitflags 2.6.0", +] + +[[package]] +name = "reqwest" +version = "0.11.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd67538700a17451e7cba03ac727fb961abb7607553461627b97de0b89cf4a62" +dependencies = [ + "base64", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-tls", + "ipnet", + "js-sys", + "log", + "mime", + "native-tls", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls-pemfile", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "system-configuration", + "tokio", + "tokio-native-tls", + "tokio-util", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", + "winreg", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + +[[package]] +name = "rustix" +version = "0.38.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +dependencies = [ + "bitflags 2.6.0", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustls-pemfile" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" +dependencies = [ + "base64", +] + +[[package]] +name = "ryu" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" + +[[package]] +name = "schannel" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" 
+dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "security-framework" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c627723fd09706bacdb5cf41499e95098555af3c3c29d014dc3c458ef6be11c0" +dependencies = [ + "bitflags 2.6.0", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "317936bbbd05227752583946b9e66d7ce3b489f84e11a94a510b4437fef407d7" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "serde" +version = "1.0.203" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.203" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.120" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1" +dependencies = [ + "libc", +] + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "socket2" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "spider-client" +version = "0.1.0" +dependencies = [ + "reqwest", + "serde", + "serde_json", + "tokio", + "tokio-stream", +] + +[[package]] +name = "syn" +version = "2.0.68" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" + +[[package]] +name = "system-configuration" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" +dependencies = [ + 
"core-foundation-sys", + "libc", +] + +[[package]] +name = "tempfile" +version = "3.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" +dependencies = [ + "cfg-if", + "fastrand", + "rustix", + "windows-sys 0.52.0", +] + +[[package]] +name = "tinyvec" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce6b6a2fb3a985e99cebfaefa9faa3024743da73304ca1c683a36429613d3d22" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.38.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" +dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "num_cpus", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys 0.48.0", +] + +[[package]] +name = "tokio-macros" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "267ac89e0bec6e691e5813911606935d77c476ff49024f98abcea3e7b15e37af" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = 
"0.7.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tower-service" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" + +[[package]] +name = "tracing" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +dependencies = [ + "pin-project-lite", + "tracing-core", +] + +[[package]] +name = "tracing-core" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +dependencies = [ + "once_cell", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "unicode-bidi" +version = "0.3.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "unicode-normalization" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "url" +version = "2.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" 
+dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.92" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" + +[[package]] +name = "wasm-streams" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "web-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = 
"windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "winreg" +version = "0.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] diff --git a/rust/Cargo.toml b/rust/Cargo.toml new file mode 100644 index 0000000..9b5facc --- /dev/null +++ b/rust/Cargo.toml 
@@ -0,0 +1,19 @@ +[package] +name = "spider-client" +version = "0.1.0" +edition = "2021" +authors = [ "j-mendez "] +description = "Spider Cloud client" +license = "MIT" +readme = "README.md" +repository = "https://github.com/spider-rs/spider-clients" +keywords = ["crawler", "web-crawler", "spider", "web-indexer", "site-map-generator"] +categories = ["web-programming"] +include = ["src/*", "../../LICENSE", "README.md"] + +[dependencies] +reqwest = { version = "0.11", features = ["json", "stream"] } +tokio = { version = "1", features = ["full"] } +serde = { version = "1", features = ["derive"] } +serde_json = { version = "1" } +tokio-stream = "0.1.15" diff --git a/rust/README.md b/rust/README.md new file mode 100644 index 0000000..aee28a2 --- /dev/null +++ b/rust/README.md @@ -0,0 +1,300 @@ +# Spider Cloud Rust SDK + +The Spider Cloud Rust SDK offers a toolkit for straightforward website scraping, crawling at scale, and other utilities like extracting links and taking screenshots, enabling you to collect data formatted for compatibility with language models (LLMs). It features a user-friendly interface for seamless integration with the Spider Cloud API. + +-- +Current WIP +-- + +## Installation + +To use the Spider Cloud Rust SDK, include the following in your `Cargo.toml`: + +```toml +[dependencies] +spider-client = "0.1" +``` + +## Usage + +1. Get an API key from [spider.cloud](https://spider.cloud) +2. Set the API key as an environment variable named `SPIDER_API_KEY` or pass it as an argument when creating an instance of the `Spider` struct. 
+ +Here's an example of how to use the SDK: + +```rust +use serde_json::json; +use std::env; + +#[tokio::main] +async fn main() { + // Set the API key as an environment variable + env::set_var("SPIDER_API_KEY", "your_api_key"); + + // Initialize the Spider with your API key + let spider = Spider::new(None).expect("API key must be provided"); + + let url = "https://spider.cloud"; + + // Scrape a single URL + let scraped_data = spider.scrape_url(url, None, false, "application/json").await.expect("Failed to scrape the URL"); + + println!("Scraped Data: {:?}", scraped_data); + + // Crawl a website + let crawler_params = RequestParams { + limit: Some(1), + proxy_enabled: Some(true), + store_data: Some(false), + metadata: Some(false), + request: Some(RequestType::Http), + ..Default::default() + }; + + let crawl_result = spider.crawl_url(url, Some(crawler_params), false, "application/json", None::<fn(serde_json::Value)>).await.expect("Failed to crawl the URL"); + + println!("Crawl Result: {:?}", crawl_result); +} +``` + +### Scraping a URL + +To scrape data from a single URL: + +```rust +let url = "https://example.com"; +let scraped_data = spider.scrape_url(url, None, false, "application/json").await.expect("Failed to scrape the URL"); +``` + +### Crawling a Website + +To automate crawling a website: + +```rust +let url = "https://example.com"; +let crawl_params = RequestParams { + limit: Some(200), + request: Some(RequestType::Smart), + ..Default::default() +}; +let crawl_result = spider.crawl_url(url, Some(crawl_params), false, "application/json", None::<fn(serde_json::Value)>).await.expect("Failed to crawl the URL"); +``` + +#### Crawl Streaming + +Stream crawl the website in chunks to scale with a callback: + +```rust +fn handle_json(json_obj: serde_json::Value) { + println!("Received chunk: {:?}", json_obj); +} + +let url = "https://example.com"; +let crawl_params = RequestParams { + limit: Some(200), + store_data: Some(false), + ..Default::default() +}; + +spider.crawl_url( + url, + Some(crawl_params), + true, 
+ "application/json", + Some(handle_json) +).await.expect("Failed to crawl the URL"); +``` + +### Search + +Perform a search for websites to crawl or gather search results: + +```rust +let query = "a sports website"; +let crawl_params = RequestParams { + request: Some(RequestType::Smart), + search_limit: Some(5), + limit: Some(5), + fetch_page_content: Some(true), + ..Default::default() +}; +let crawl_result = spider.search(query, Some(crawl_params), false, "application/json").await.expect("Failed to perform search"); +``` + +### Retrieving Links from a URL(s) + +Extract all links from a specified URL: + +```rust +let url = "https://example.com"; +let links = spider.links(url, None, false, "application/json").await.expect("Failed to retrieve links from URL"); +``` + +### Transform + +Transform HTML to markdown or text lightning fast: + +```rust +let data = vec![json!({"html": "

<html><body><h1>Hello world</h1></body></html>

"})]; +let params = RequestParams { + readability: Some(false), + return_format: Some(ReturnFormat::Markdown), + ..Default::default() +}; +let result = spider.transform(data, Some(params), false, "application/json").await.expect("Failed to transform HTML to markdown"); +println!("Transformed Data: {:?}", result); +``` + +### Taking Screenshots of a URL(s) + +Capture a screenshot of a given URL: + +```rust +let url = "https://example.com"; +let screenshot = spider.screenshot(url, None, false, "application/json").await.expect("Failed to take screenshot of URL"); +``` + +### Extracting Contact Information + +Extract contact details from a specified URL: + +```rust +let url = "https://example.com"; +let contacts = spider.extract_contacts(url, None, false, "application/json").await.expect("Failed to extract contacts from URL"); +println!("Extracted Contacts: {:?}", contacts); +``` + +### Labeling Data from a URL(s) + +Label the data extracted from a particular URL: + +```rust +let url = "https://example.com"; +let labeled_data = spider.label(url, None, false, "application/json").await.expect("Failed to label data from URL"); +println!("Labeled Data: {:?}", labeled_data); +``` + +### Checking Crawl State + +You can check the crawl state of a specific URL: + +```rust +let url = "https://example.com"; +let state = spider.get_crawl_state(url, None, false, "application/json").await.expect("Failed to get crawl state for URL"); +println!("Crawl State: {:?}", state); +``` + +### Downloading Files + +You can download the results of the website: + +```rust +let url = "https://example.com"; +let options = hashmap!{ + "page" => 0, + "limit" => 100, + "expiresIn" => 3600 // Optional, add if needed +}; +let response = spider.create_signed_url(Some(url), Some(options)).await.expect("Failed to create signed URL"); +println!("Download URL: {:?}", response); +``` + +### Checking Available Credits + +You can check the remaining credits on your account: + +```rust +let credits = 
spider.get_credits().await.expect("Failed to get credits"); +println!("Remaining Credits: {:?}", credits); +``` + +### Data Operations + +The Spider client can now interact with specific data tables to create, retrieve, and delete data. + +#### Retrieve Data from a Table + +To fetch data from a specified table by applying query parameters: + +```rust +let table_name = "pages"; +let query_params = RequestParams { + limit: Some(20), + ..Default::default() +}; +let response = spider.data_get(table_name, Some(query_params)).await.expect("Failed to retrieve data from table"); +println!("Data from table: {:?}", response); +``` + +#### Delete Data from a Table + +To delete data from a specified table based on certain conditions: + +```rust +let table_name = "websites"; +let delete_params = RequestParams { + domain: Some("www.example.com".to_string()), + ..Default::default() +}; +let response = spider.data_delete(table_name, Some(delete_params)).await.expect("Failed to delete data from table"); +println!("Delete Response: {:?}", response); +``` + +## Streaming + +If you need to use streaming, set the `stream` parameter to `true` and provide a callback function. Only `crawl_url` currently accepts a callback: + +```rust +fn handle_json(json_obj: serde_json::Value) { + println!("Received chunk: {:?}", json_obj); +} + +let url = "https://example.com"; +let crawler_params = RequestParams { + limit: Some(1), + proxy_enabled: Some(true), + store_data: Some(false), + metadata: Some(false), + request: Some(RequestType::Http), + ..Default::default() +}; + +spider.crawl_url(url, Some(crawler_params), true, "application/json", Some(handle_json)).await.expect("Failed to crawl the URL"); +``` + +## Content-Type + +The following Content-Type headers are supported using the `content_type` parameter: + +- `application/json` +- `text/csv` +- `application/xml` +- `application/jsonl` + +```rust +let url = "https://example.com"; + +let crawler_params = RequestParams { + limit: Some(1), + proxy_enabled: Some(true), + store_data: Some(false), + metadata: 
Some(false), + request: Some(RequestType::Http), + ..Default::default() +}; + +// Stream JSON lines back to the client +spider.crawl_url(url, Some(crawler_params), true, "application/jsonl", None::<fn(serde_json::Value)>).await.expect("Failed to crawl the URL"); +``` + +## Error Handling + +The SDK does not raise exceptions: every request method returns a `Result`. If an error occurs during a request, it is propagated to the caller as the `Err` variant with a descriptive error message. + +## Contributing + +Contributions to the Spider Cloud Rust SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository. + +## License + +The Spider Cloud Rust SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT). diff --git a/rust/src/lib.rs b/rust/src/lib.rs new file mode 100644 index 0000000..1ee2efa --- /dev/null +++ b/rust/src/lib.rs @@ -0,0 +1,883 @@ +use reqwest::Client; +use reqwest::{Error, Response}; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use tokio_stream::StreamExt; + +/// Structure representing the Chunking algorithm dictionary. +#[derive(Debug, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +struct ChunkingAlgDict { + /// The chunking algorithm to use, defined as a specific type. + r#type: ChunkingType, + /// The amount to chunk by. + value: i32, +} + +/// Enum representing different types of Chunking. +#[derive(Debug, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +enum ChunkingType { + /// By the word count. + ByWords, + /// By the line count. + ByLines, + /// By the char length. + ByCharacterLength, + /// By sentence. + BySentence, +} + +/// Structure representing request parameters. +#[derive(Debug, Default, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +struct RequestParams { + /// The URL to be crawled. + url: Option, + /// The type of request to be made. 
+ request: Option, + /// The maximum number of pages the crawler should visit. + limit: Option, + /// The format in which the result should be returned. + return_format: Option, + /// Specifies whether to only visit the top-level domain. + tld: Option, + /// The depth of the crawl. + depth: Option, + /// Specifies whether the request should be cached. + cache: Option, + /// The budget for various resources. + budget: Option>, + /// The blacklist routes to ignore. This can be a Regex string pattern. + black_list: Option>, + /// The whitelist routes to only crawl. This can be a Regex string pattern and can be combined with `black_list`. + white_list: Option>, + /// The locale to be used during the crawl. + locale: Option, + /// The cookies to be set for the request, formatted as a single string. + cookies: Option, + /// Specifies whether to use stealth techniques to avoid detection. + stealth: Option, + /// The headers to be used for the request. + headers: Option>, + /// Specifies whether anti-bot measures should be used. + anti_bot: Option, + /// Specifies whether to include metadata in the response. + metadata: Option, + /// The dimensions of the viewport. + viewport: Option>, + /// The encoding to be used for the request. + encoding: Option, + /// Specifies whether to include subdomains in the crawl. + subdomains: Option, + /// The user agent string to be used for the request. + user_agent: Option, + /// Specifies whether the response data should be stored. + store_data: Option, + /// Configuration settings for GPT (presumably the OpenAI GPT model options - TODO confirm against the Spider API docs). + gpt_config: Option>, + /// Specifies whether to use fingerprinting protection. + fingerprint: Option, + /// Specifies whether to perform the request without using storage. + storageless: Option, + /// Specifies whether readability optimizations should be applied. + readability: Option, + /// Specifies whether to use a proxy for the request. 
+ proxy_enabled: Option, + /// Specifies whether to respect the site's robots.txt file. + respect_robots: Option, + /// CSS selector to be used to filter the content. + query_selector: Option, + /// Specifies whether to load all resources of the crawl target. + full_resources: Option, + /// Specifies whether to use the sitemap links. + sitemap: Option, + /// Get page insights to determine information like request duration, accessibility, and other web vitals. Requires the `metadata` parameter to be set to `true`. + page_insights: Option, + /// Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`. + return_embeddings: Option, + /// The timeout for the request, in milliseconds. + request_timeout: Option, + /// Specifies whether to run the request in the background. + run_in_background: Option, + /// Specifies whether to skip configuration checks. + skip_config_checks: Option, + /// The chunking algorithm to use. + chunking_alg: Option, +} + +/// Enum representing different types of Requests. +/// +/// Variants are serialized in camelCase (`http`, `chrome`, `smart`). +#[derive(Debug, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +enum RequestType { + Http, + Chrome, + Smart, +} + +/// Enum representing different return formats. +/// +/// Variants are serialized in camelCase (`raw`, `markdown`, `commonmark`, `html2text`, `text`, `bytes`). +#[derive(Debug, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +enum ReturnFormat { + Raw, + Markdown, + Commonmark, + Html2text, + Text, + Bytes, +} + +/// Represents a Spider with API key and HTTP client. +/// +/// NOTE(review): this struct is declared without `pub`, so it is not reachable from outside the crate - confirm whether it should be exported. +#[derive(Debug)] +struct Spider { + /// API key sent as a Bearer token in the `Authorization` header of every request. + api_key: String, + /// HTTP client used to send every request. + client: Client, +} + +impl Spider { + /// Creates a new instance of Spider. + /// + /// # Arguments + /// + /// * `api_key` - An optional API key. When `None`, the `SPIDER_API_KEY` environment variable is used instead. + /// + /// # Returns + /// + /// A new instance of Spider, or the error string `"No API key provided"` if neither the argument nor the environment variable supplies a key. 
+ pub fn new(api_key: Option) -> Result { + let api_key = api_key.or_else(|| std::env::var("SPIDER_API_KEY").ok()); + match api_key { + Some(key) => Ok(Self { + api_key: key, + client: Client::new(), + }), + None => Err("No API key provided"), + } + } + + /// Sends a POST request with a JSON body to `https://api.spider.cloud/{endpoint}`. + /// + /// NOTE(review): unlike `api_delete` and `create_signed_url`, the URL built here has no `v1/` prefix - confirm which base path is intended. + /// + /// # Arguments + /// + /// * `endpoint` - The API endpoint, appended to the base URL. + /// * `data` - The request data as a HashMap, sent as the JSON body. + /// * `content_type` - The value sent in the `Content-Type` header. + /// + /// # Returns + /// + /// The raw response from the API, or the error reported by the HTTP client. + async fn api_post( + &self, + endpoint: &str, + data: HashMap, + content_type: &str, + ) -> Result { + let url = format!("https://api.spider.cloud/{}", endpoint); + self.client + .post(&url) + .header("Content-Type", content_type) + .header("Authorization", format!("Bearer {}", self.api_key)) + .json(&data) + .send() + .await + } + + /// Sends a GET request to `https://api.spider.cloud/{endpoint}` and parses the response body as JSON. + /// + /// NOTE(review): unlike `api_delete` and `create_signed_url`, the URL built here has no `v1/` prefix - confirm which base path is intended. + /// + /// # Arguments + /// + /// * `endpoint` - The API endpoint, appended to the base URL. + /// + /// # Returns + /// + /// The response from the API as a JSON value, or an error if sending the request or decoding the body fails. + async fn api_get(&self, endpoint: &str) -> Result { + let url = format!("https://api.spider.cloud/{}", endpoint); + let res = self + .client + .get(&url) + .header("Content-Type", "application/json") + .header("Authorization", format!("Bearer {}", self.api_key)) + .send() + .await?; + res.json().await + } + + /// Scrapes a URL by posting to the `crawl` endpoint with `limit` set to 1. + /// + /// NOTE(review): fields left unset in `params` presumably serialize as `null` (no `skip_serializing_if` is visible on `RequestParams`) and, being merged last, would overwrite the `url` and `limit` set here - confirm. + /// + /// # Arguments + /// + /// * `url` - The URL to scrape. + /// * `params` - Optional request parameters, merged into the request body after `url` and `limit`. + /// * `stream` - Whether streaming is enabled. NOTE(review): currently not read by this method. + /// * `content_type` - The content type of the request. + /// + /// # Returns + /// + /// The response from the API as a JSON value. 
+ pub async fn scrape_url( + &self, + url: &str, + params: Option, + stream: bool, + content_type: &str, + ) -> Result { + let mut data = HashMap::new(); + + data.insert( + "url".to_string(), + serde_json::Value::String(url.to_string()), + ); + data.insert("limit".to_string(), serde_json::Value::Number(1.into())); + + if let Ok(params) = serde_json::to_value(params) { + match params.as_object() { + Some(ref p) => { + let params_collect = p.iter().map(|(k, v)| (k.to_string(), v.clone())); + + data.extend(params_collect); + } + _ => (), + } + } + + let res = self.api_post("crawl", data, content_type).await?; + res.json().await + } + + /// Sends a DELETE request to `https://api.spider.cloud/v1/{endpoint}`. + /// + /// # Arguments + /// + /// * `endpoint` - The API endpoint, appended to the `v1/` base URL. + /// * `params` - Optional request parameters, sent as the JSON body when present. + /// * `stream` - Whether streaming is enabled. NOTE(review): currently not read by this method. + /// * `content_type` - The value sent in the `Content-Type` header. + /// + /// # Returns + /// + /// The raw response from the API, or the error reported by the HTTP client. + async fn api_delete( + &self, + endpoint: &str, + params: Option>, + stream: bool, + content_type: &str, + ) -> Result { + let url = format!("https://api.spider.cloud/v1/{}", endpoint); + let request_builder = self + .client + .delete(&url) + .header("Content-Type", content_type) + .header("Authorization", format!("Bearer {}", self.api_key)); + + let request_builder = if let Some(params) = params { + request_builder.json(¶ms) + } else { + request_builder + }; + + request_builder.send().await + } + + /// Crawls a URL by posting to the `crawl` endpoint. + /// + /// NOTE(review): fields left unset in `params` presumably serialize as `null` (no `skip_serializing_if` is visible on `RequestParams`) and, being merged last, would overwrite the `url` set here - confirm. + /// + /// # Arguments + /// + /// * `url` - The URL to crawl. + /// * `params` - Optional request parameters, merged into the request body after `url`. + /// * `stream` - Whether streaming is enabled. When `true`, the response body is not parsed as a whole; it is only read (chunk by chunk) if a `callback` is given. + /// * `content_type` - The content type of the request. + /// * `callback` - Optional callback function to handle each streamed chunk. It is called only for chunks that parse as JSON; other chunks are skipped silently, and stream errors are printed to stderr. + /// + /// # Returns + /// + /// The response from the API as a JSON value, or `serde_json::Value::Null` when `stream` is `true`. 
+ pub async fn crawl_url( + &self, + url: &str, + params: Option, + stream: bool, + content_type: &str, + callback: Option, + ) -> Result { + let mut data = HashMap::new(); + data.insert("url".into(), serde_json::Value::String(url.to_string())); + + if let Ok(params) = serde_json::to_value(params) { + match params.as_object() { + Some(ref p) => { + data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone()))); + } + _ => (), + } + } + + let res = self.api_post("crawl", data, content_type).await?; + + if stream { + if let Some(callback) = callback { + let stream = res.bytes_stream(); + tokio::pin!(stream); + + while let Some(item) = stream.next().await { + match item { + Ok(chunk) => match serde_json::from_slice(&chunk) { + Ok(json_obj) => { + callback(json_obj); + } + _ => (), + }, + Err(e) => { + eprintln!("Error in streaming response: {}", e); + } + } + } + Ok(serde_json::Value::Null) + } else { + Ok(serde_json::Value::Null) + } + } else { + res.json().await + } + } + + /// Fetches links from a URL by posting to the `links` endpoint. + /// + /// # Arguments + /// + /// * `url` - The URL to fetch links from. + /// * `params` - Optional request parameters, merged into the request body after `url`. + /// * `stream` - Whether streaming is enabled. NOTE(review): currently not read by this method. + /// * `content_type` - The content type of the request. + /// + /// # Returns + /// + /// The response from the API as a JSON value. + pub async fn links( + &self, + url: &str, + params: Option, + stream: bool, + content_type: &str, + ) -> Result { + let mut data = HashMap::new(); + data.insert("url".into(), serde_json::Value::String(url.to_string())); + if let Ok(params) = serde_json::to_value(params) { + match params.as_object() { + Some(ref p) => { + data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone()))); + } + _ => (), + } + } + + let res = self.api_post("links", data, content_type).await?; + res.json().await + } + + /// Takes a screenshot of a URL by posting to the `screenshot` endpoint. + /// + /// # Arguments + /// + /// * `url` - The URL to take a screenshot of. + /// * `params` - Optional request parameters, merged into the request body after `url`. 
+ /// * `stream` - Whether streaming is enabled. NOTE(review): currently not read by this method. + /// * `content_type` - The content type of the request. + /// + /// # Returns + /// + /// The response from the API as a JSON value. + pub async fn screenshot( + &self, + url: &str, + params: Option, + stream: bool, + content_type: &str, + ) -> Result { + let mut data = HashMap::new(); + data.insert("url".into(), serde_json::Value::String(url.to_string())); + + if let Ok(params) = serde_json::to_value(params) { + match params.as_object() { + Some(ref p) => { + data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone()))); + } + _ => (), + } + } + + let res = self.api_post("screenshot", data, content_type).await?; + res.json().await + } + + /// Searches for a query by posting to the `search` endpoint. + /// + /// # Arguments + /// + /// * `q` - The query to search for, sent under the `search` key. + /// * `params` - Optional request parameters, merged into the request body after `search`. + /// * `stream` - Whether streaming is enabled. NOTE(review): currently not read by this method. + /// * `content_type` - The content type of the request. + /// + /// # Returns + /// + /// The response from the API as a JSON value. + pub async fn search( + &self, + q: &str, + params: Option, + stream: bool, + content_type: &str, + ) -> Result { + let mut data = HashMap::new(); + data.insert("search".into(), serde_json::Value::String(q.to_string())); + + if let Ok(params) = serde_json::to_value(params) { + match params.as_object() { + Some(ref p) => { + data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone()))); + } + _ => (), + } + } + let res = self.api_post("search", data, content_type).await?; + res.json().await + } + + /// Transforms data by posting it to the `transform` endpoint. + /// + /// # Arguments + /// + /// * `data` - The data to transform, sent under the `data` key. + /// * `params` - Optional request parameters. NOTE(review): currently serialized but never added to the request body. + /// * `stream` - Whether streaming is enabled. NOTE(review): currently not read by this method. + /// * `content_type` - The content type of the request. + /// + /// # Returns + /// + /// The response from the API as a JSON value. 
+ pub async fn transform( + &self, + data: Vec>, + params: Option, + stream: bool, + content_type: &str, + ) -> Result { + let mut payload = HashMap::new(); + + payload.insert("data".into(), serde_json::to_value(data).unwrap()); + + if let Ok(params) = serde_json::to_value(params) { + match params.as_object() { + Some(ref p) => { + // NOTE(review): the merge below is commented out, so `params` is never added to `payload` and caller options are not sent. It also targets `data` rather than `payload`. + // data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone()))); + } + _ => (), + } + } + + let res = self.api_post("transform", payload, content_type).await?; + + res.json().await + } + + /// Extracts contacts from a URL by posting to the `pipeline/extract-contacts` endpoint. + /// + /// # Arguments + /// + /// * `url` - The URL to extract contacts from. + /// * `params` - Optional request parameters. NOTE(review): currently serialized but never added to the request body. + /// * `stream` - Whether streaming is enabled. NOTE(review): currently not read by this method. + /// * `content_type` - The content type of the request. + /// + /// # Returns + /// + /// The response from the API as a JSON value. + pub async fn extract_contacts( + &self, + url: &str, + params: Option, + stream: bool, + content_type: &str, + ) -> Result { + let mut data = HashMap::new(); + + data.insert("url".into(), serde_json::to_value(url).unwrap()); + + if let Ok(params) = serde_json::to_value(params) { + match params.as_object() { + Some(ref p) => { + // NOTE(review): the merge below is commented out, so `params` is never added to the request body. + // data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone()))); + } + _ => (), + } + } + let res = self + .api_post("pipeline/extract-contacts", data, content_type) + .await?; + res.json().await + } + + /// Labels data from a URL by posting to the `pipeline/label` endpoint. + /// + /// # Arguments + /// + /// * `url` - The URL to label data from. + /// * `params` - Optional request parameters. NOTE(review): currently serialized but never added to the request body. + /// * `stream` - Whether streaming is enabled. NOTE(review): currently not read by this method. + /// * `content_type` - The content type of the request. + /// + /// # Returns + /// + /// The response from the API as a JSON value. 
+ pub async fn label( + &self, + url: &str, + params: Option, + stream: bool, + content_type: &str, + ) -> Result { + let mut data = HashMap::new(); + data.insert("url".into(), serde_json::Value::String(url.to_string())); + + if let Ok(params) = serde_json::to_value(params) { + match params.as_object() { + Some(ref p) => { + // NOTE(review): the merge below is commented out, so `params` is never added to the request body. + // data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone()))); + } + _ => (), + } + } + + let res = self.api_post("pipeline/label", data, content_type).await?; + res.json().await + } + + /// Creates a signed URL by sending a GET request to `https://api.spider.cloud/v1/data/storage`. + /// + /// # Arguments + /// + /// * `domain` - Optional domain, sent as the `domain` query parameter. + /// * `options` - Optional options; each key/value pair is converted to a string and sent as a query parameter. + /// + /// # Returns + /// + /// The raw response from the API; the body is not parsed. + pub async fn create_signed_url( + &self, + domain: Option<&str>, + options: Option>, + ) -> Result { + let mut params = HashMap::new(); + + if let Some(domain) = domain { + params.insert("domain".to_string(), domain.to_string()); + } + + if let Some(options) = options { + for (key, value) in options { + params.insert(key.to_string(), value.to_string()); + } + } + + let url = format!("https://api.spider.cloud/v1/data/storage"); + let request = self + .client + .get(&url) + .header("Content-Type", "application/octet-stream") + .header("Authorization", format!("Bearer {}", self.api_key)) + .query(¶ms); + + let res = request.send().await?; + + Ok(res) + } + + /// Gets the crawl state of a URL by posting to the `data/crawl_state` endpoint. + /// + /// # Arguments + /// + /// * `url` - The URL to get the crawl state of. + /// * `params` - Optional request parameters. NOTE(review): currently serialized but never added to the request body. + /// * `stream` - Whether streaming is enabled. NOTE(review): currently not read by this method. + /// * `content_type` - The content type of the request; also sent in the body under the `contentType` key. 
+ /// + /// # Returns + /// + /// The response from the API as a JSON value. + pub async fn get_crawl_state( + &self, + url: &str, + params: Option, + stream: bool, + content_type: &str, + ) -> Result { + let mut payload = HashMap::new(); + payload.insert("url".into(), serde_json::Value::String(url.to_string())); + payload.insert( + "contentType".into(), + serde_json::Value::String(content_type.to_string()), + ); + + if let Ok(params) = serde_json::to_value(params) { + match params.as_object() { + Some(ref p) => { + // NOTE(review): the merge below is commented out, so `params` is never added to `payload`. It also targets `data` rather than `payload`. + // data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone()))); + } + _ => (), + } + } + let res = self + .api_post("data/crawl_state", payload, content_type) + .await?; + res.json().await + } + + /// Gets the account credits by sending a GET request to the `data/credits` endpoint. + /// + /// # Returns + /// + /// The response from the API as a JSON value. + pub async fn get_credits(&self) -> Result { + self.api_get("data/credits").await + } + + /// Posts to the `data/{table}` endpoint with an `application/json` content type. + /// + /// # Arguments + /// + /// * `table` - The table name, inserted into the endpoint path. + /// * `data` - Optional request parameters. NOTE(review): currently never added to the payload, so an empty JSON object is sent. + /// + /// # Returns + /// + /// The response from the API as a JSON value. + pub async fn data_post( + &self, + table: &str, + data: Option, + ) -> Result { + let mut payload = HashMap::new(); + + if let Ok(params) = serde_json::to_value(data) { + match params.as_object() { + Some(ref p) => { + // NOTE(review): the merge below is commented out, so `payload` stays empty. It also targets `data` rather than `payload`. + // data.extend(p.iter().map(|(k, v)| (k.to_string(), v.clone()))); + } + _ => (), + } + } + + let res = self + .api_post(&format!("data/{}", table), payload, "application/json") + .await?; + res.json().await + } + + /// Sends a GET request to the `data/{table}` endpoint. + /// + /// # Arguments + /// + /// * `table` - The table name, inserted into the endpoint path. + /// * `params` - Optional request parameters. NOTE(review): they are collected into `payload`, but `payload` is never passed to `api_get`, so they are not sent. + /// + /// # Returns + /// + /// The response from the API as a JSON value. + pub async fn data_get( + &self, + table: &str, + params: Option, + ) -> Result { + let mut payload = HashMap::new(); + + if let Some(params) = params { + let params = serde_json::to_value(params).unwrap(); + payload.extend( + params + .as_object() + .unwrap() + .iter() + .map(|(k, v)| (k.as_str(), v.clone())), + ); + } + + let res = self.api_get(&format!("data/{}", table)).await?; + Ok(res) + } + + /// Sends a DELETE request for the `data/{table}` endpoint through `api_delete`. + /// + /// # Arguments + /// + /// * `table` - The table name, inserted into the endpoint path. + /// * `params` - Optional request parameters. NOTE(review): currently never added to the payload, so an empty JSON object is sent. + /// + /// # Returns + /// + /// The response from the API as a JSON value. + pub async fn data_delete( + &self, + table: &str, + params: Option, + ) -> Result { + let mut payload = HashMap::new(); + + if let Some(params) = params { + let params = serde_json::to_value(params).unwrap(); + // NOTE(review): the merge below is commented out, so `payload` stays empty. + // let params = params + // .as_object() + // .unwrap() + // .iter() + // .map(|(k, v)| (k.as_str(), v.clone())); + + // payload.extend(params); + } + + let res = self + .api_delete( + 
&format!("data/{}", table), + Some(payload), + false, + "application/json", + ) + .await?; + res.json().await + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // Helper function to create a Spider instance for tests + // NOTE(review): the async tests below use this placeholder key against the real api.spider.cloud endpoints, so their outcome depends on network access and on how the API answers an invalid key. + fn create_spider() -> Spider { + Spider::new(Some("test_api_key".to_string())).unwrap() + } + + #[test] + fn test_new_spider_with_api_key() { + let spider = Spider::new(Some("test_api_key".to_string())); + assert!(spider.is_ok()); + } + + // NOTE(review): this test and the next one set and remove the same process-wide environment variable; presumably they can interfere when tests run in parallel - confirm. + #[test] + fn test_new_spider_without_api_key() { + std::env::set_var("SPIDER_API_KEY", "test_api_key"); + let spider = Spider::new(None); + assert!(spider.is_ok()); + } + + #[test] + fn test_new_spider_no_api_key() { + std::env::remove_var("SPIDER_API_KEY"); + let spider = Spider::new(None); + assert!(spider.is_err()); + } + + #[tokio::test] + async fn test_api_post() { + let spider = create_spider(); + let mut data = HashMap::new(); + data.insert("key".into(), serde_json::Value::String("value".to_string())); + let response = spider.api_post("endpoint", data, "application/json").await; + assert!(response.is_ok()); + } + + #[tokio::test] + async fn test_api_get() { + let spider = create_spider(); + let response = spider.api_get("endpoint").await; + assert!(response.is_ok()); + } + + #[tokio::test] + async fn test_scrape_url() { + let spider = create_spider(); + let response = spider + .scrape_url("https://example.com", None, false, "application/json") + .await; + assert!(response.is_ok()); + } + + #[tokio::test] + async fn test_crawl_url() { + let spider = create_spider(); + let response = spider + .crawl_url( + "https://example.com", + None, + false, + "application/json", + None::, + ) + .await; + assert!(response.is_ok()); + } + + #[tokio::test] + async fn test_links() { + let spider = create_spider(); + let response = spider + .links("https://example.com", None, false, "application/json") + .await; + assert!(response.is_ok()); + } + + #[tokio::test] + async fn test_screenshot() { + let spider = 
create_spider(); + let response = spider + .screenshot("https://example.com", None, false, "application/json") + .await; + assert!(response.is_ok()); + } + + #[tokio::test] + async fn test_search() { + let spider = create_spider(); + let response = spider + .search("query", None, false, "application/json") + .await; + assert!(response.is_ok()); + } + + #[tokio::test] + async fn test_transform() { + let spider = create_spider(); + let data = vec![HashMap::new()]; + let response = spider + .transform(data, None, false, "application/json") + .await; + assert!(response.is_ok()); + } + + #[tokio::test] + async fn test_extract_contacts() { + let spider = create_spider(); + let response = spider + .extract_contacts("https://example.com", None, false, "application/json") + .await; + assert!(response.is_ok()); + } + + #[tokio::test] + async fn test_label() { + let spider = create_spider(); + let response = spider + .label("https://example.com", None, false, "application/json") + .await; + assert!(response.is_ok()); + } + + #[tokio::test] + async fn test_create_signed_url() { + let spider = create_spider(); + let response = spider.create_signed_url(Some("example.com"), None).await; + assert!(response.is_ok()); + } + + #[tokio::test] + async fn test_get_crawl_state() { + let spider = create_spider(); + let response = spider + .get_crawl_state("https://example.com", None, false, "application/json") + .await; + assert!(response.is_ok()); + } + + #[tokio::test] + async fn test_get_credits() { + let spider = create_spider(); + let response = spider.get_credits().await; + assert!(response.is_ok()); + } + + #[tokio::test] + async fn test_data_post() { + let spider = create_spider(); + let response = spider.data_post("table", None).await; + assert!(response.is_ok()); + } + + #[tokio::test] + async fn test_data_get() { + let spider = create_spider(); + let response = spider.data_get("table", None).await; + assert!(response.is_ok()); + } + + #[tokio::test] + async fn 
test_data_delete() { + let spider = create_spider(); + let response = spider.data_delete("table", None).await; + assert!(response.is_ok()); + } +}