Skip to content

Commit

Permalink
Merge branch 'release/v0.4.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
ja573 committed Jun 17, 2024
2 parents c891ac8 + f2c00ac commit fb4ab84
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 13 deletions.
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "archive-pdf-urls"
version = "0.3.1"
version = "0.4.0"
authors = ["Javier Arias <[email protected]>"]
edition = "2021"
license = "Apache-2.0"
Expand All @@ -18,4 +18,4 @@ log = "0.4.21"
lopdf = "0.32.0"
regex = "1.10.4"
tokio = { version = "1.36.0", features = ["full"] }
waybackmachine-client = { version = "=0.3.1", path = "waybackmachine-client"}
waybackmachine-client = { version = "=0.4.0", path = "waybackmachine-client"}
2 changes: 1 addition & 1 deletion waybackmachine-client/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "waybackmachine-client"
version = "0.3.1"
version = "0.4.0"
authors = ["Javier Arias <[email protected]>"]
edition = "2021"
license = "Apache-2.0"
Expand Down
31 changes: 23 additions & 8 deletions waybackmachine-client/src/archivableurl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ pub struct ArchivableUrl {
pub url: Url,
}

/// List of domains that block wayback requests.
///
/// When parsing a URL, a domain host is compared against every entry with a
/// substring check (`contains`); a match rejects the URL with
/// `Error::ExcludedUrl`.
const EXCLUDED_DOMAINS: &[&str] = &["archive.org", "jstor.org", "diw.de"];

impl ArchivableUrl {
/// Parses and validates the URL for archiving
pub fn parse(url: &str) -> Result<Self, Error> {
Expand All @@ -24,14 +27,16 @@ impl ArchivableUrl {

// Check if the host is excluded
match host {
Host::Domain(domain) if domain.contains("localhost") => {
return Err(Error::InvalidUrl(self.url.to_string()));
}
Host::Domain(domain) if domain.contains("archive.org") => {
return Err(Error::ExcludedUrl(self.url.to_string()));
}
Host::Domain(domain) if domain.contains("jstor.org") => {
return Err(Error::ExcludedUrl(self.url.to_string()));
Host::Domain(domain) => {
if domain.contains("localhost") {
return Err(Error::InvalidUrl(self.url.to_string()));
}

for &pattern in EXCLUDED_DOMAINS {
if domain.contains(pattern) {
return Err(Error::ExcludedUrl(self.url.to_string()));
}
}
}
Host::Ipv4(ipv4)
if ipv4.is_loopback()
Expand Down Expand Up @@ -153,4 +158,14 @@ mod tests {
assert!(result.is_err());
assert_eq!(result.err(), Some(Error::ExcludedUrl(url.to_string())));
}

#[test]
fn excluded_domains() {
    // Each listed domain, used as the host of an otherwise ordinary HTTPS URL,
    // must be rejected with `ExcludedUrl` carrying that same URL string.
    for host in EXCLUDED_DOMAINS.iter().copied() {
        let candidate = format!("https://{}/some-path", host);
        let outcome = ArchivableUrl::parse(&candidate);
        assert!(outcome.is_err());
        assert_eq!(outcome.err(), Some(Error::ExcludedUrl(candidate)));
    }
}
}

0 comments on commit fb4ab84

Please sign in to comment.