diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..b75d634 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,3 @@ +dataset.csv +model.json +target \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 019cd4c..5729882 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ edition = "2021" bayespam = "1.1.0" csv = "1.2.1" lazy_static = "1.4.0" -rocket = {version = "=0.5.0-rc.3", features = ["json"]} +rocket = {version = "0.5.0-rc.3", features = ["json"]} serde = "1.0.160" [profile.release] @@ -19,4 +19,4 @@ opt-level = 3 strip = true overflow-checks = false panic = "abort" -incremental = true \ No newline at end of file +incremental = true diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..012ef14 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,23 @@ +FROM rust as rust-builder + +WORKDIR /usr/src/app +COPY ./Cargo.toml . +COPY ./Cargo.lock . + +# For caching dependencies and avoid rebuilding them +RUN mkdir ./src && echo 'fn main() { println!("Dummy!"); }' > ./src/main.rs +RUN cargo build --release +RUN rm -rf ./src +COPY ./src ./src +RUN touch -a -m ./src/main.rs +RUN cargo build --release + +FROM debian:buster-slim +COPY --from=rust-builder /usr/src/app/target/release/tg_antispam_rs /usr/local/bin/ +WORKDIR /usr/local/bin + +RUN apt-get update +RUN apt-get install wget -y +RUN wget "https://huggingface.co/datasets/thehamkercat/telegram-spam-ham/raw/main/dataset.csv" + +CMD ["tg_antispam_rs"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..3d26ce1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 TheHamkerCat + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit 
persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..aee2349 --- /dev/null +++ b/README.md @@ -0,0 +1,112 @@ +## Telegram Antispam API +#### Blazingly fast spam classification API built using the [Rocket](https://github.com/SergioBenitez/Rocket) web framework. + +![Rust](https://forthebadge.com/images/badges/made-with-rust.svg) + +#### Notes +- The classifier works in aggressive mode; it can sometimes classify non-spam messages/emails as spam (when the input is too small). +- The dataset provided may contain some NSFW texts or personal info; it has not been thoroughly checked. +- I've included a docker-based example, but you can run it without docker as well. +- The `profanity` field in the response is only there to maintain backwards compatibility for ARQ users; it is always `false`. + +## Installation: +I would suggest using docker compose for this, but it's up to you! 
+### With Docker compose + +```sh +$ git clone https://github.com/thehamkercat/telegram-antispam-rs +$ cd telegram-antispam-rs +$ docker-compose build +$ docker-compose up +``` + +### With Cargo + +```sh +$ git clone https://github.com/thehamkercat/telegram-antispam-rs +$ cd telegram-antispam-rs +$ cargo run --release +``` + +## Endpoints: + +```http +POST /spam_check HTTP/1.1 +Host: localhost:8000 +Content-Type: application/json + +{ + "text": "subscribe to my youtube channel" +} + +HTTP/1.1 200 OK +content-length: 59 +content-type: application/json + +{ + "spam": 99, + "ham": 1, + "is_spam": true, + "profanity": false, + "spam_probability": 99 +} +``` + +## Usage example: +### Python + +```python +import requests + +url = "http://localhost:8000/spam_check" +data = {"text": "subscribe to my youtube channel"} + +result = requests.post(url, json=data).json() + +print("Is spam:", result["is_spam"]) +print("Spam probability:", result["spam_probability"]) +``` + +### Rust + +```rust +use std::collections::HashMap; +use serde::{Deserialize, Serialize}; + + +#[derive(Serialize, Deserialize, Debug)] +struct Resp { + spam: u8, + ham: u8, + is_spam: bool, + spam_probability: u8, + profanity: bool +} + +#[tokio::main] +async fn main(){ + let client = reqwest::Client::new(); + let mut map = HashMap::new(); + map.insert("text", "Hello please subscribe to my youtube channel!"); + + let res = client + .post("http://localhost:8000/spam_check") + .json(&map) + .send() + .await + .unwrap(); + + let text_response = res.text().await.unwrap(); + + let json: Result<Resp, serde_json::Error> = serde_json::from_str(text_response.as_str()); + if json.is_ok() { + println!("{:?}", json.unwrap()); + } +} + +// [dependencies] +// reqwest = { version = "0.11", features = ["json"] } +// serde = {version = "1.0.160", features = ["derive"]} +// serde_json = "1.0.96" +// tokio = { version = "1", features = ["full"] } +``` \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 
index 0000000..9dc8e2a --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,7 @@ +version: '3' +services: + tgrs: + build: . + restart: always + ports: + - '8000:8000' \ No newline at end of file diff --git a/src/spam_check.rs b/src/spam_check.rs index e99ee72..c99be16 100644 --- a/src/spam_check.rs +++ b/src/spam_check.rs @@ -10,7 +10,7 @@ lazy_static! { ).unwrap() }; } -static SPAM_PROB_THRESHOLD: u8 = 70; +static SPAM_PROB_THRESHOLD: u8 = 85; #[derive(Deserialize)] #[serde(crate = "rocket::serde")] @@ -27,7 +27,7 @@ fn spam_check(task: Json>) -> Value { let mut ham = 100 - spam; // Short messages are mostly ham - if spam > ham && task.text.len() < 20 { + if spam > ham && task.text.len() < 30 { spam -= 25; ham += 25; }