Skip to content

Commit

Permalink
Merge branch 'release/v0.3.1'
Browse files Browse the repository at this point in the history
  • Loading branch information
ja573 committed May 31, 2024
2 parents 65bc0e8 + 25f89d6 commit c891ac8
Show file tree
Hide file tree
Showing 6 changed files with 29 additions and 7 deletions.
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "archive-pdf-urls"
version = "0.3.0"
version = "0.3.1"
authors = ["Javier Arias <[email protected]>"]
edition = "2021"
license = "Apache-2.0"
Expand All @@ -18,4 +18,4 @@ log = "0.4.21"
lopdf = "0.32.0"
regex = "1.10.4"
tokio = { version = "1.36.0", features = ["full"] }
waybackmachine-client = { version = "0.3.0", path = "waybackmachine-client"}
waybackmachine-client = { version = "=0.3.1", path = "waybackmachine-client"}
5 changes: 4 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use clap::{crate_authors, crate_version, Arg, ArgAction, Command};
use log::{error, info};
use log::{error, info, warn};
use lopdf::{Dictionary, Document, Object};
use regex::Regex;
use std::collections::HashSet;
Expand Down Expand Up @@ -48,6 +48,9 @@ async fn main() {
.collect();

let links_set = extract_links(doc);
if links_set.is_empty() {
warn!("No page annotations found in this PDF file");
}
let client = WaybackMachineClient::new(ClientConfig::default());

let mut exit_code = 0;
Expand Down
2 changes: 1 addition & 1 deletion waybackmachine-client/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "waybackmachine-client"
version = "0.3.0"
version = "0.3.1"
authors = ["Javier Arias <[email protected]>"]
edition = "2021"
license = "Apache-2.0"
Expand Down
19 changes: 19 additions & 0 deletions waybackmachine-client/src/archivableurl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ impl ArchivableUrl {
Host::Domain(domain) if domain.contains("archive.org") => {
return Err(Error::ExcludedUrl(self.url.to_string()));
}
Host::Domain(domain) if domain.contains("jstor.org") => {
return Err(Error::ExcludedUrl(self.url.to_string()));
}
Host::Ipv4(ipv4)
if ipv4.is_loopback()
|| ipv4.is_private()
Expand Down Expand Up @@ -134,4 +137,20 @@ mod tests {
assert!(result.is_err());
assert_eq!(result.err(), Some(Error::InvalidUrl(url.to_string())));
}

/// Parsing an archive.org URL must fail with `Error::ExcludedUrl`
/// carrying the original URL string.
#[test]
fn wayback_url() {
    let input = "https://archive.org/some-book";
    let outcome = ArchivableUrl::parse(input);
    // Check the failure first so a wrongly accepted URL is reported clearly.
    assert!(outcome.is_err());
    let expected = Some(Error::ExcludedUrl(input.to_string()));
    assert_eq!(outcome.err(), expected);
}

/// Parsing a jstor.org URL must fail with `Error::ExcludedUrl`
/// carrying the original URL string.
#[test]
fn jstor_url() {
    let input = "https://jstor.org/some-book";
    let outcome = ArchivableUrl::parse(input);
    // Check the failure first so a wrongly accepted URL is reported clearly.
    assert!(outcome.is_err());
    let expected = Some(Error::ExcludedUrl(input.to_string()));
    assert_eq!(outcome.err(), expected);
}
}
2 changes: 1 addition & 1 deletion waybackmachine-client/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use serde::Deserialize;
use url::Url;

/// Maximum number of allowed request retry attempts.
const DEFAULT_MAX_REQUEST_RETRIES: u32 = 5;
const DEFAULT_MAX_REQUEST_RETRIES: u32 = 10;

/// Default threshold for considering an archive as recent, in days.
/// URLs with archives older than this threshold will be re-archived.
Expand Down

0 comments on commit c891ac8

Please sign in to comment.