Commit
added all sports on site
Nikolai Schimke authored and Nikolai Schimke committed Feb 1, 2024
1 parent 4a69c7f commit f6fb996
Showing 9 changed files with 107 additions and 89 deletions.
3 changes: 2 additions & 1 deletion migrations/2023-09-16-190301_create_streams/up.sql
@@ -2,10 +2,11 @@ CREATE TABLE "stream" (
     id INTEGER PRIMARY KEY AUTOINCREMENT,
     home TEXT NOT NULL,
     away TEXT NOT NULL,
-    start_time DATETIME NOT NULL,
+    start_time TIMESTAMP NOT NULL,
     league TEXT NOT NULL,
     country TEXT NOT NULL,
     url TEXT NOT NULL,
     stream_link TEXT NOT NULL,
+    sport TEXT NOT NULL,
     UNIQUE(url, home, away, start_time)
 );
Binary file modified sports.db
6 changes: 4 additions & 2 deletions src/bin/sportshub.rs
@@ -7,10 +7,11 @@ use scraper::{db, scrape_utils, web_server_utils};
 pub const MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations");
 
 fn run_migrations(connection: &mut impl MigrationHarness<Sqlite>) -> Result<(), Error> {
-    connection.revert_all_migrations(MIGRATIONS).unwrap();
-    println!("Reverted all migrations");
     connection.run_pending_migrations(MIGRATIONS).unwrap();
 
+    let mut conn = db::helpers::establish_connection()?;
+    db::helpers::delete_all_past_streams(&mut conn)?;
+
     Ok(())
 }
 

@@ -48,6 +49,7 @@ async fn main() {
             scrape_utils::start_scraping(tabs).unwrap();
         }
         Some(Commands::Server { port }) => {
+            run_migrations(&mut conn).unwrap();
             web_server_utils::run(port).await;
         }
         None => {
1 change: 1 addition & 0 deletions src/constants/mod.rs
@@ -0,0 +1 @@
+pub mod sports;
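The new src/constants/sports.rs file is among this commit's 9 changed files but is not shown in this view. From its usage in src/scrape_utils.rs below (iterating sports::SPORTS, reading sport.name and sport.url), it plausibly has a shape like this sketch — the struct layout is inferred and the entries are illustrative guesses, not the committed code:

    // hypothetical reconstruction of src/constants/sports.rs, inferred from usage
    pub struct Sport {
        pub name: &'static str,
        pub url: &'static str,
    }

    // one entry per sports section of the site; names and urls here are guesses
    pub static SPORTS: &[Sport] = &[
        Sport { name: "football", url: "https://reddit.sportshub.fan/" },
        // Sport { name: "basketball", url: "..." }, and so on
    ];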
32 changes: 18 additions & 14 deletions src/db/helpers.rs
@@ -1,5 +1,7 @@
 //! Database operation helpers for sqlite, using diesel
 
+use std::time::{Duration, Instant};
+
 use diesel::{prelude::*, RunQueryDsl};
 
 use super::{
@@ -14,30 +16,32 @@ pub fn establish_connection() -> Result<SqliteConnection, anyhow::Error> {
     Ok(SqliteConnection::establish(&database_url)?)
 }
 
-pub fn create_stream(conn: &mut SqliteConnection,
-                     new_stream: &StreamNew)
-                     -> Result<usize, anyhow::Error> {
-    Ok(diesel::insert_or_ignore_into(stream::table).values(new_stream)
-                                                   .execute(conn)?)
+pub fn create_stream(conn: &mut SqliteConnection, new_stream: &StreamNew) -> Result<usize, anyhow::Error> {
+    Ok(diesel::insert_or_ignore_into(stream::table)
+        .values(new_stream)
+        .execute(conn)?)
 }
 
 pub fn get_streams(conn: &mut SqliteConnection) -> Result<Vec<Stream>, anyhow::Error> {
     Ok(stream.load::<Stream>(conn)?)
 }
 
 pub fn get_empty_streams(conn: &mut SqliteConnection) -> Result<Vec<Stream>, anyhow::Error> {
-    Ok(stream.filter(schema::stream::stream_link.eq(""))
-             .load::<Stream>(conn)?)
+    Ok(stream.filter(schema::stream::stream_link.eq("")).load::<Stream>(conn)?)
 }
 
 pub fn get_linked_streams(conn: &mut SqliteConnection) -> Result<Vec<Stream>, anyhow::Error> {
-    Ok(stream.filter(schema::stream::stream_link.ne(""))
-             .load::<Stream>(conn)?)
+    Ok(stream.filter(schema::stream::stream_link.ne("")).load::<Stream>(conn)?)
 }
 
-pub fn get_streams_by_id(conn: &mut SqliteConnection,
-                         search_id: i32)
-                         -> Result<Vec<Stream>, anyhow::Error> {
-    Ok(stream.filter(schema::stream::id.eq(search_id))
-             .load::<Stream>(conn)?)
+pub fn get_streams_by_id(conn: &mut SqliteConnection, search_id: i32) -> Result<Vec<Stream>, anyhow::Error> {
+    Ok(stream.filter(schema::stream::id.eq(search_id)).load::<Stream>(conn)?)
 }
+
+pub fn delete_all_past_streams(conn: &mut SqliteConnection) -> Result<usize, anyhow::Error> {
+    println!("Deleting all 3+ hour past streams...");
+    Ok(
+        diesel::delete(stream.filter(start_time.le(chrono::Utc::now().naive_utc() - Duration::from_secs(3 * 60 * 60))))
+            .execute(conn)?,
+    )
+}
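A note on the cutoff arithmetic in delete_all_past_streams: subtracting a std::time::Duration from the chrono::NaiveDateTime returned by Utc::now().naive_utc() relies on chrono 0.4's Sub<std::time::Duration> impl. An equivalent, arguably more explicit spelling (a sketch, not the committed code) is:

    let cutoff = chrono::Utc::now().naive_utc() - chrono::Duration::hours(3);
    // rows with start_time <= cutoff are treated as finished and deleted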
6 changes: 6 additions & 0 deletions src/db/models.rs
@@ -18,6 +18,7 @@ pub struct Stream {
     pub country: String,
     pub url: String,
     pub stream_link: String,
+    pub sport: String,
 }
 
 #[derive(Debug, Insertable, Clone)]
@@ -30,6 +31,7 @@ pub struct StreamNew<'a> {
     pub country: &'a str,
     pub url: &'a str,
     pub stream_link: &'a str,
+    pub sport: &'a str,
 }
 
 impl Serialize for Stream {
@@ -47,6 +49,7 @@ impl Serialize for Stream {
         stream.serialize_field("country", &self.country)?;
         stream.serialize_field("url", &self.url)?;
         stream.serialize_field("stream_link", &split_streams)?;
+        stream.serialize_field("sport", &self.sport)?;
         stream.end()
     }
 }
@@ -68,6 +71,7 @@ mod tests {
             country: "country".to_string(),
             url: "url".to_string(),
             stream_link: "stream_link".to_string(),
+            sport: "sport".to_string(),
         };
 
         let serialised = serde_json::to_string(&stream).unwrap();
@@ -86,6 +90,7 @@
             country: "country".to_string(),
             url: "url".to_string(),
             stream_link: "stream_link,stream_link2".to_string(),
+            sport: "sport".to_string(),
         };
 
         let serialised = serde_json::to_string(&stream).unwrap();
@@ -104,6 +109,7 @@
             country: "country".to_string(),
             url: "url".to_string(),
             stream_link: "".to_string(),
+            sport: "sport".to_string(),
         };
 
         let serialised = serde_json::to_string(&stream).unwrap();
1 change: 1 addition & 0 deletions src/db/schema.rs
@@ -10,5 +10,6 @@ diesel::table! {
         country -> Text,
         url -> Text,
         stream_link -> Text,
+        sport -> Text,
     }
 }
1 change: 1 addition & 0 deletions src/lib.rs
@@ -1,3 +1,4 @@
+pub mod constants;
 pub mod date_parser;
 pub mod db;
 pub mod query_selectors;
146 changes: 74 additions & 72 deletions src/scrape_utils.rs
@@ -13,7 +13,51 @@ use db::{models, schema};
 use diesel::{ExpressionMethods, RunQueryDsl, SqliteConnection};
 use headless_chrome::{Browser, Tab};
 
-use crate::{db, query_selectors};
+use crate::{
+    constants::sports::{self, Sport},
+    db,
+    query_selectors,
+};
+
+
+pub fn start_scraping(open_tabs: usize) -> Result<(), anyhow::Error> {
+    // realised we didnt need adblocker when headless
+    let browser = Browser::new({
+        headless_chrome::LaunchOptions {
+            headless: false,
+            sandbox: true,
+            ignore_certificate_errors: true,
+            ..Default::default()
+        }
+    })?;
+
+    let mut conn = db::helpers::establish_connection()?;
+
+    let tab = browser.new_tab()?;
+
+
+    for sport in sports::SPORTS.iter() {
+        today_games(&tab, &mut conn, sport)?;
+    }
+
+
+    // we get all the links from database that don't have stream links
+    // and we check them in parallel
+    // my 8gb ram m1 macbook air can handle 10 tabs relatively easily
+    // takes ~27 seconds to scan everything
+    // however can improve by using a shared queue instead of splitting it
+    // so... TODO!
+    check_all_links(&browser, &mut conn, open_tabs)?;
+
+    // we close all the tabs because otherwise it shows an error when program
+    // finishes
+    for t in (*browser.get_tabs().as_ref().lock().unwrap()).iter() {
+        t.close(true)?;
+    }
+
+    Ok(())
+}
+
 
 /// This function scrapes all the games from the home page and saves them to database.
 /// It takes roughly 1 second to scrape ~500 games.
@@ -22,16 +66,29 @@
 /// # Arguments
 /// *tab* - is the tab that we use to navigate to the page and scrape the games, we use headless_chrome tabs.
 /// *conn* - is the connection to the database, we use diesel to save the games to database.
-pub fn today_games(tab: &Tab, conn: &mut SqliteConnection) -> Result<(), anyhow::Error> {
+pub fn today_games(tab: &Tab, conn: &mut SqliteConnection, sport: &Sport) -> Result<(), anyhow::Error> {
     // we navigate to the page and wait until the table showing links is loaded
-    tab.navigate_to("https://reddit.sportshub.fan/")?
-        .wait_for_element(".list-events")?;
+    // not my typo, they actually named it "shedule"
+    tab.navigate_to(sport.url)?.wait_for_element("#sports-shedule")?;
 
     // we get the html of the table and remove all the tabs and newlines
-    let html = tab
-        .find_element(".list-events")?
-        .get_content()?
-        .replace(['\t', '\n'], "");
+
+
+    println!("Parsing {}", &sport.name);
+
+    let html = tab.find_element("#sports-shedule");
+
+    if let Err(e) = html {
+        println!("Error: {}", e);
+        return Ok(());
+    }
+
+    let html = html.unwrap().get_content()?.replace(['\t', '\n'], "");
+
+    if html.is_empty() {
+        return Ok(());
+    }
+
 
     // create the parser using tl
     let dom = tl::parse(&html, tl::ParserOptions::default())?;
@@ -43,7 +100,7 @@
     // we iterate over all the games and parse them
     for game in dom_games {
         if let Some(x) = game.get(parser) {
-            parse_game(conn, &x.inner_html(parser).to_string())?;
+            parse_game(conn, &sport.name, &x.inner_html(parser).to_string())?;
         }
     }
 
@@ -54,7 +111,7 @@
 /// It takes roughly 400µs to parse a single game. (± 100µs)
 ///
 /// This should never panic
-pub fn parse_game(conn: &mut SqliteConnection, html: &str) -> Result<(), anyhow::Error> {
+pub fn parse_game(conn: &mut SqliteConnection, sport: &str, html: &str) -> Result<(), anyhow::Error> {
     // creating a new parser for each game is not the best idea, but it's not a problem
     // because it takes roughly 400µs to parse a single game
     let dom = tl::parse(html, tl::ParserOptions::default())?;
@@ -100,6 +157,7 @@
         country: &country.trim(),
         url: &url.trim(),
         stream_link: "",
+        sport,
     };
 
     db::helpers::create_stream(conn, &new_stream)?;
@@ -114,11 +172,7 @@
 /// *tab* - is the tab that we use to navigate to the page and scrape the links, we use headless_chrome tabs.
 /// *conn* - is the connection to the database, we use diesel to save the links to database.
 /// *url* - is the url of the game page that we get from database.
-pub fn url_to_links(
-    tab: &Tab,
-    conn: &mut SqliteConnection,
-    url: &str,
-) -> Result<(), anyhow::Error> {
+pub fn url_to_links(tab: &Tab, conn: &mut SqliteConnection, url: &str) -> Result<(), anyhow::Error> {
     tab.navigate_to(url)?.wait_for_element("#content-event")?;
 
     // they encode url, so we need to decode it
@@ -159,18 +213,13 @@
 /// It takes roughly 27 seconds to check all the links.
 /// (My 8gb ram m1 macbook air with a 90mbps internet connection can handle 10 tabs relatively easily)
 /// It can be improved by using a shared queue instead of splitting it.
-pub fn check_all_links(
-    browser: &Browser,
-    conn: &mut SqliteConnection,
-    tabs_count: usize,
-) -> Result<(), anyhow::Error> {
+pub fn check_all_links(browser: &Browser, conn: &mut SqliteConnection, tabs_count: usize) -> Result<(), anyhow::Error> {
     // we get all the streams from database that have no links
     // wrap it in an arc to share it between threads
     let all_streams = Arc::new(db::helpers::get_empty_streams(conn)?);
 
     // we split the streams into chunks and create a thread for each chunk
-    let chunked_streams: Vec<&[models::Stream]> =
-        all_streams.chunks(all_streams.len() / tabs_count).collect();
+    let chunked_streams: Vec<&[models::Stream]> = all_streams.chunks(all_streams.len() / tabs_count).collect();
 
     let length = all_streams.len();
 
@@ -203,9 +252,7 @@
             while let Some(stream) = streams.pop() {
                 check_link(tab.clone().borrow_mut(), &mut conn, &stream.url).unwrap();
                 // we print the progress
-                let mut completed_count = completed
-                    .lock()
-                    .expect("mutex is already opened by current thread");
+                let mut completed_count = completed.lock().expect("mutex is already opened by current thread");
                 *completed_count += 1;
                 println!("{} / {}", completed_count, length);
             }
@@ -221,58 +268,13 @@
 
     let time_end = std::time::Instant::now();
 
-    println!(
-        "Time elapsed to scan all games: {:?}",
-        time_end - time_start
-    );
+    println!("Time elapsed to scan all games: {:?}", time_end - time_start);
 
     Ok(())
 }
 
-pub fn check_link(
-    tab: &mut Arc<Tab>,
-    conn: &mut SqliteConnection,
-    link: &str,
-) -> Result<(), anyhow::Error> {
+pub fn check_link(tab: &mut Arc<Tab>, conn: &mut SqliteConnection, link: &str) -> Result<(), anyhow::Error> {
     url_to_links(tab.borrow_mut(), conn.borrow_mut(), link).unwrap();
 
     Ok(())
 }
-
-pub fn start_scraping(open_tabs: usize) -> Result<(), anyhow::Error> {
-    // realised we didnt need adblocker when headless
-    let browser = Browser::new({
-        headless_chrome::LaunchOptions {
-            headless: true,
-            sandbox: true,
-            ignore_certificate_errors: true,
-            ..Default::default()
-        }
-    })?;
-
-    let mut conn = db::helpers::establish_connection()?;
-
-    let tab = browser.new_tab()?;
-
-    // we get to the page with all the links for upcoming games
-    // this will scrape ~500 games and save them to database
-    // takes roughly 1 second, but it's not a problem, because
-    // we do it only once a day
-    today_games(&tab, &mut conn)?;
-
-    // we get all the links from database that don't have stream links
-    // and we check them in parallel
-    // my 8gb ram m1 macbook air can handle 10 tabs relatively easily
-    // takes ~27 seconds to scan everything
-    // however can improve by using a shared queue instead of splitting it
-    // so... TODO!
-    check_all_links(&browser, &mut conn, open_tabs)?;
-
-    // we close all the tabs because otherwise it shows an error when program
-    // finishes
-    for t in (*browser.get_tabs().as_ref().lock().unwrap()).iter() {
-        t.close(true)?;
-    }
-
-    Ok(())
-}
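Aside: the "shared queue instead of splitting it" TODO kept in the comments above could look roughly like this — a hypothetical sketch, not part of this commit, in which every tab pops work from one Arc<Mutex<Vec<_>>> so fast tabs never sit idle while a slow chunk finishes. It assumes the surrounding module's items plus std::sync::{Arc, Mutex}:

    fn check_all_links_shared_queue(browser: &Browser, conn: &mut SqliteConnection, tabs_count: usize) -> Result<(), anyhow::Error> {
        // one shared work queue instead of per-thread chunks
        let queue = Arc::new(Mutex::new(db::helpers::get_empty_streams(conn)?));
        let mut handles = Vec::new();

        for _ in 0..tabs_count {
            let tab = browser.new_tab()?;
            let queue = Arc::clone(&queue);
            handles.push(std::thread::spawn(move || {
                // each worker needs its own connection: SqliteConnection is not thread-safe
                let mut conn = db::helpers::establish_connection().unwrap();
                loop {
                    // take the lock only long enough to pop one job
                    let next = queue.lock().unwrap().pop();
                    match next {
                        Some(stream) => url_to_links(&tab, &mut conn, &stream.url).unwrap(),
                        None => break,
                    }
                }
            }));
        }

        for handle in handles {
            handle.join().unwrap();
        }

        Ok(())
    }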
